def sequentialJaccardBed():
    """Compute Jaccard scores for every BED pair, one pair after another.

    The pairs are obtained from ``jaccardBed()``; each pair is indexed as
    ``pair[0]`` / ``pair[1]`` and both entries are passed to ``pr.read_bed``.
    The collected scores, the pairs that failed and the elapsed wall-clock
    time are printed. Nothing is returned.
    """
    sequential_time = time.time()
    bed_pairs = jaccardBed()
    jaccard_scores = []
    error_bed_pairs = []
    for pair in bed_pairs:
        try:
            file1 = pr.read_bed(pair[0])
            file2 = pr.read_bed(pair[1])
            jaccard_scores.append(file1.stats.jaccard(file2))
        except Exception as error:
            # Best-effort: one bad pair must not abort the whole run.
            print('error')
            print(error)
            # Record the inputs of the failing pair. The parsed objects may
            # not exist yet (read_bed itself can be what failed) or may be
            # left over from a previous iteration.
            error_bed_pairs.append([pair[0], pair[1]])
    print('scores')
    print(jaccard_scores)
    print('errors')
    print(error_bed_pairs)
    print('Sequentially calculating jaccard scores --- %.2f seconds ---' %
          (time.time() - sequential_time))
def calc_windowed_seg_sites(self, chrom=0, L=1e3, filt_rec=True, mask=None):
    """Calculate windowed estimates of segregating sites.

    Physical positions of ``chrom`` are binned into windows of length ``L``
    and their weights are summed per window. The per-window sums, the window
    end coordinates, the recombination positions interpolated at the window
    midpoints and the mask-derived scaling weights are stacked row-wise and
    stored in ``self.chrom_total_dict[chrom]``. Nothing is returned.

    Arguments:

    * chrom: identifier for the chromosome (key into the per-chromosome
      dictionaries on ``self``)
    * L: length of independent locus (used as the bin width)
    * filt_rec: filter recombination; when true, a position is kept only if
      its recombination position differs from that of the next position
    * mask: bed file for the underlying mask (read with
      ``pyranges.read_bed``), or ``None`` for no masking

    """
    # Only chrom_pos_dict is checked here; the other dictionaries used below
    # are assumed to be populated alongside it -- TODO confirm.
    assert self.chrom_pos_dict is not None
    phys_pos = self.chrom_physpos_dict[chrom]
    rec_pos = self.chrom_pos_dict[chrom]
    weights = self.chrom_weight_dict[chrom]
    if filt_rec:
        # diff has one element fewer than rec_pos, so index i is kept when
        # rec_pos[i] != rec_pos[i + 1] and the final position is never kept.
        diff = np.abs(rec_pos[:-1] - rec_pos[1:])
        idx = np.where(diff != 0)[0]
        phys_pos = phys_pos[idx]
        rec_pos = rec_pos[idx]
        weights = weights[idx]
    if mask is not None:
        # Cast to float so that masked positions can be marked with NaN.
        phys_pos = phys_pos.astype(np.float64)
        df_mask = pyranges.read_bed(mask)
        df_pos = PyRanges(chromosomes=chrom, starts=phys_pos, ends=(phys_pos + 1))
        cov_sites = df_pos.coverage(df_mask)
        # NOTE(review): this indexes phys_pos with row numbers of the coverage
        # result, which presumes coverage() preserves input order -- confirm.
        sites_idx = np.array(cov_sites.FractionOverlaps.astype(np.float32))
        idx = np.where(sites_idx > 0.0)[0]
        # Any position overlapping the mask is excluded from the histogram.
        phys_pos[idx] = np.nan
    # 1. Set up the bins for the analysis (NaN-aware because of the mask)
    bins = np.arange(np.nanmin(phys_pos), np.nanmax(phys_pos), L)
    windowed_vars, bin_edges = np.histogram(
        phys_pos[~np.isnan(phys_pos)],
        bins=bins,
        weights=weights[~np.isnan(phys_pos)],
    )
    bin_edges = bin_edges.astype(np.uint32)
    # Interpolate the recombination positions at the midpoints of the bins.
    # NOTE(review): when a mask is given, phys_pos still contains NaN here and
    # is passed to interp1d as the x-coordinates -- confirm this is intended.
    f = interpolate.interp1d(phys_pos, rec_pos)
    midpts = bin_edges[:-1] + (bin_edges[1:] - bin_edges[:-1]) / 2
    rec_midpts = f(midpts)
    # Calculate the weightings from the mask as needed; default is 1 per window.
    mask_weights = np.ones(rec_midpts.size)
    if mask is not None:
        # Mask must be a bedfile
        df_windows = PyRanges(
            chromosomes=chrom, starts=bin_edges[:-1], ends=bin_edges[1:]
        )
        # NOTE(review): the mask file is parsed a second time here.
        df_mask = pyranges.read_bed(mask)
        cov = df_windows.coverage(df_mask)
        mask_weights = np.array(cov.FractionOverlaps.astype(np.float32))
        # Set the mask weights to scale up the fraction that may be missing:
        # a window that is a fraction p masked gets weight 1 / (1 - p).
        mask_weights = 1.0 / (1.0 - mask_weights)
        # Fully masked windows (p == 1) divide by zero; mark them as NaN.
        mask_weights[np.isinf(mask_weights)] = np.nan
    # Stack all of the data so that it can be used later on. Rows are:
    # windowed sums, window end coordinates, recombination midpoints, weights.
    tot_data = np.vstack([windowed_vars, bin_edges[1:], rec_midpts, mask_weights])
    self.chrom_total_dict[chrom] = tot_data
def countContexts(fastaFilePath, whiteListBed=None, blackListBed=None):
    """Count di- and tri-nucleotide contexts in the allowed regions of a FASTA.

    The allowed regions are the merged ``whiteListBed`` intervals, or, when no
    whitelist is given, one interval per FASTA reference running from 1 to the
    reference length. If ``blackListBed`` is given, its merged intervals are
    subtracted from the allowed regions.

    Returns a tuple ``(diNucCounts, triNucCounts)`` of ``defaultdict(int)``
    keyed by the 2- and 3-character sequence slices.
    """
    debug(f"Starting to count contexts of nucleotides in {fastaFilePath}")

    triNucCounts = defaultdict(int)
    diNucCounts = defaultdict(int)

    with FastaFile(fastaFilePath) as fastaFile:
        if whiteListBed is not None:
            # pyranges wants a string, callers may hand us a Path
            wlObj = read_bed(str(whiteListBed)).merge()
        else:
            # no whitelist supplied: build one covering every reference
            wlObj = from_dict(
                {
                    "Chromosome": fastaFile.references,
                    "Start": [1] * fastaFile.nreferences,
                    "End": fastaFile.lengths,
                }
            )

        if blackListBed is not None:
            # same Path -> str cast as for the whitelist; subtracting can only
            # shrink the ranges, so no further merge is performed afterwards
            wlObj = wlObj.subtract(read_bed(str(blackListBed)).merge())

        # sequences are fetched through the already open FastaFile rather than
        # through pyranges, which would need pyfaidx
        for chr, df in wlObj:
            for start, end in zip(df["Start"], df["End"]):
                seq = fastaFile.fetch(reference=chr, start=start, end=end)
                # NOTE(review): both counters share this range, so the last
                # dinucleotide of each region is not counted -- confirm intent.
                for i in range(len(seq) - 2):
                    triNuc = seq[i : i + 3]
                    diNucCounts[triNuc[:2]] += 1
                    triNucCounts[triNuc] += 1
            debug(f"contect frequency analysis complete for chromsome {chr}")

    return (diNucCounts, triNucCounts)
def exonOverlap(args, df):
    """Annotate calls with the exons they overlap and rank them by score.

    ``df`` is wrapped in a ``PyRanges`` and joined against the BED file at
    ``args.exons``; columns of the joined side ending up with a ``_b`` marker
    are dropped. The coordinate columns are removed, ``Name`` / ``Score`` are
    renamed to ``gene`` / ``OMIM_syndrome`` and duplicate rows are discarded.
    The result is left-merged with ``args.genelist`` on ``gene`` (it is passed
    straight to ``DataFrame.merge``, so it must already be a frame with a
    ``gene`` column), missing ``score`` / ``normalized_score`` values become 0
    and rows are sorted by ``score``, highest first.

    Returns an empty ``DataFrame`` when nothing overlaps an exon.
    """
    exon_frame = pr.read_bed(args.exons)
    # Materialise the joined frame once; every ``.df`` access rebuilds it.
    overlap_df = PyRanges(df).join(exon_frame).drop(like="_b").df
    if overlap_df.empty:
        # Nothing to annotate: return early, an empty frame has no 'gene'
        # column to merge on.
        return pd.DataFrame()

    exon_calls = (
        overlap_df
        .drop(columns=['Chromosome', 'Start', 'End'])
        .rename(columns={'Name': 'gene', 'Score': 'OMIM_syndrome'})
        .drop_duplicates()
    )
    exon_calls = exon_calls.merge(args.genelist, on=['gene'], how='left')
    exon_calls = exon_calls.fillna(value={'score': 0, 'normalized_score': 0})
    return exon_calls.sort_values(by='score', ascending=False)
def geneOverlapTransloInv(args, sample_start, sample_end, sample_frame):
    """Annotate both breakpoints of each call with the gene they fall in.

    ``sample_start`` and ``sample_end`` are each joined against the ``Name``
    and ``Score`` columns of the BED file at ``args.genes``. Hits for the
    start breakpoint are reported in ``Name`` / ``Score``, hits for the end
    breakpoint in ``Name2`` / ``Score2``; results are attached to
    ``sample_frame`` via ``SmapEntryID``. A side without any hit at all gets
    the string ``'None'`` in both of its columns.

    Returns the annotated frame. When neither side has hits ``sample_frame``
    is modified in place and returned.
    """
    gene_frame = pr.read_bed(args.genes)
    gene_cols = gene_frame[["Name", "Score"]]
    # Materialise each joined frame once; ``.df`` rebuilds it on every access.
    start_df = PyRanges(sample_start).join(gene_cols).drop(like="_b").df
    end_df = PyRanges(sample_end).join(gene_cols).drop(like="_b").df

    end_renames = {'Name': 'Name2', 'Score': 'Score2'}
    start_items = ['SmapEntryID', 'Name', 'Score']
    end_items = ['SmapEntryID', 'Name2', 'Score2']

    if start_df.empty and end_df.empty:
        sample_frame['Name'] = sample_frame['Name2'] = sample_frame[
            'Score'] = sample_frame['Score2'] = 'None'
    elif start_df.empty:
        # The columns are already renamed at this point, so the filter has to
        # ask for Name2/Score2; asking for 'Name' discards the annotation.
        sample_frame = (
            end_df.rename(columns=end_renames)
            .filter(items=end_items)
            .drop_duplicates()
            .merge(sample_frame, on=['SmapEntryID'], how='right')
        )
        sample_frame['Name'] = sample_frame['Score'] = 'None'
    elif end_df.empty:
        sample_frame = (
            start_df.filter(items=start_items)
            .drop_duplicates()
            .merge(sample_frame, on=['SmapEntryID'], how='right')
        )
        sample_frame['Name2'] = sample_frame['Score2'] = 'None'
    else:
        sample_frame = (
            start_df.filter(items=start_items)
            .drop_duplicates()
            .merge(sample_frame, on=['SmapEntryID'], how='right')
        )
        sample_frame = sample_frame.merge(
            end_df.rename(columns=end_renames).filter(items=end_items),
            on=['SmapEntryID'],
            how='left',
        )
    return sample_frame
def _read_intervals(gtf_path=None, bed_path=None, pranges=None, intervals=None, interval_attrs=None, duplicate_attr=False): alternatives = [bed_path, pranges, intervals, gtf_path] if sum(i is not None for i in alternatives) != 1: raise ValueError('only one of `gth_path`, `bed_path`, `pranges`,' '`intervals` or should given as input.') if gtf_path: import pyranges pranges = pyranges.read_gtf(gtf_path, duplicate_attr=duplicate_attr) elif bed_path: import pyranges pranges = pyranges.read_bed(bed_path) elif intervals: if interval_attrs is not None: raise ValueError( '`interval_attrs` is not valid with `intervals`') pranges = intervals_to_pyranges(intervals) return pranges
def polyAClusterDetected(fastaPath, infile, gene_bed, out_suffix, threads):
    """Detect polyA clusters per gene, write them as BED files and filter them.

    ``get_three_end`` is run in a process pool for every gene of ``gene_bed``
    whose chromosome is not ``Mt`` or ``Pt``. Each non-``None`` result is
    unpacked into six parts that go to six BED files, after which
    ``filterPAC`` is run on the cluster and summit files.

    Args:
        fastaPath: genome FASTA, forwarded to ``filterPAC``.
        infile: input forwarded to ``get_three_end``.
        gene_bed: BED file with the gene models (indexed by ``Name``).
        out_suffix: directory that is created if missing; it is also used
            verbatim as the prefix of every output file name.
        threads: number of worker processes.
    """
    from contextlib import ExitStack

    try:
        os.mkdir(out_suffix)
    except FileExistsError:
        logger.warning(f"{out_suffix} existed!")
    except OSError as error:
        # Still best-effort (opening the outputs below fails loudly if the
        # location is unusable), but report what actually went wrong.
        logger.warning(f"could not create {out_suffix}: {error}")

    # Read the gene_bed information
    gene_model = pr.read_bed(gene_bed, as_df=True)
    gene_model = gene_model.set_index(["Name"])

    results = []
    with ProcessPoolExecutor(max_workers=threads) as e:
        for gene_id in gene_model.index:
            if gene_model.at[gene_id, "Chromosome"] not in {"Mt", "Pt"}:
                results.append(e.submit(get_three_end, infile, gene_id, gene_model))

    output_names = (
        "polya_cluster.bed",
        "polya_cluster.summit.bed",
        "major_polya_cluster.bed",
        "major_polya_cluster_summit.bed",
        "last_polya_cluster.bed",
        "last_polya_cluster_summit.bed",
    )
    # ExitStack closes every file even if a worker result raises.
    with ExitStack() as stack:
        (
            o_polya_cluster,
            o_polya_cluster_summit,
            o_major_polya_cluster,
            o_major_polya_cluster_summit,
            o_last_polya_cluster,
            o_last_polya_cluster_summit,
        ) = [
            stack.enter_context(open(f"{out_suffix}{name}", "w"))
            for name in output_names
        ]

        for res in results:
            result = res.result()
            if result is None:
                continue
            (
                polya_cluster,
                polya_cluster_summit,
                polya_cluster_major,
                polya_cluster_summit_major,
                polya_cluster_last,
                polya_cluster_summit_last,
            ) = result

            # The first two parts are iterables of lines, the rest single
            # strings.
            o_polya_cluster.writelines(polya_cluster)
            o_polya_cluster_summit.writelines(polya_cluster_summit)
            o_major_polya_cluster.write(polya_cluster_major)
            o_major_polya_cluster_summit.write(polya_cluster_summit_major)
            o_last_polya_cluster.write(polya_cluster_last)
            o_last_polya_cluster_summit.write(polya_cluster_summit_last)

    # The files are closed (and flushed) before they are read back here.
    filterPAC(
        fastaPath,
        f"{out_suffix}polya_cluster.bed",
        f"{out_suffix}polya_cluster.summit.bed",
        f"{out_suffix}polya_cluster.filtered.bed",
    )
def main():
    """Build and export one matrix per ``.bed`` file found in ``input_data/``.

    Both ``input_data/`` and ``output_matrix/`` are resolved against the
    current working directory and created when missing. Gene lengths are read
    from ``gene_lengths.csv`` in the working directory. For every ``.bed``
    file the first three columns are passed, together with the gene table, to
    ``MatricesExtractor`` and the extracted matrix is written to
    ``output_matrix/<bed file stem>_matrix_01.csv``.
    """
    # I/O directories
    input_data_dir = "input_data/"
    output_data_dir = "output_matrix/"

    working_dir = os.getcwd()
    input_dir = os.path.join(working_dir, input_data_dir)
    output_dir = os.path.join(working_dir, output_data_dir)

    # exist_ok avoids the race between checking for and creating a directory
    os.makedirs(input_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    # DATA INPUT =========================================
    # File with each gene's ID and the corresponding gene length. It is read
    # from the working directory, not from the input directory.
    genes_lengths_path = os.path.join(working_dir, "gene_lengths.csv")
    genes = pd.read_csv(genes_lengths_path)

    # Drop the first column, then name the two remaining leading columns
    genes = genes.drop(genes.columns[0], axis=1)
    genes = genes.rename(columns={
        genes.columns[0]: "GeneID",
        genes.columns[1]: "GeneLength"
    })

    # Absolute paths of all regular files in input_dir ending in ".bed"
    bed_table_FP_path_list = [
        os.path.abspath(os.path.join(input_dir, f))
        for f in listdir(input_dir)
        if isfile(os.path.join(input_dir, f)) and f.endswith(".bed")
    ]

    # Extract and export the matrix for each .bed file
    for bed_table_FP_path in bed_table_FP_path_list:
        bed_file_name = Path(os.path.basename(bed_table_FP_path)).stem
        matrix_01_csv_path = os.path.join(output_dir,
                                          bed_file_name + "_matrix_01.csv")

        # Only the first three columns of the BED table are used
        bed_table_FP = pr.read_bed(bed_table_FP_path, as_df=True)
        bed_table_FP_reduced = bed_table_FP.iloc[:, [0, 1, 2]]

        me = MatricesExtractor(bed_table_FP_reduced, genes)

        # m_addRand / m_areReadsRandomized are defined outside this function
        matrix_01 = me.extract_matrices(
            addRand=m_addRand, areReadsRandomized=m_areReadsRandomized)

        # Export the DataFrame as a CSV file
        matrix_01.to_csv(matrix_01_csv_path, index=True)
def parse_bed(bed, window):
    """Return the BED records inside ``window`` as plain tuples.

    The BED file is subset to ``window.chromosome`` and the slice
    ``window.begin:window.end``, strand information is removed and the
    ``Chromosome``, ``Score`` and ``Strand`` columns are dropped where
    present. A ``Name`` column filled with ``"noname"`` is added when the
    file has none. The result is an iterator of unnamed, index-free tuples.
    """
    logging.info("Parsing BED file")
    whole_bed = pr.read_bed(bed)
    in_window = whole_bed[window.chromosome, window.begin:window.end]
    table = in_window.unstrand().df.drop(
        columns=["Chromosome", "Score", "Strand"], errors='ignore')
    if "Name" not in table.columns:
        table["Name"] = "noname"
    return table.itertuples(index=False, name=None)
def multithreadedJaccardBed(*bed_pairs):
    """Compute the Jaccard score of one BED pair.

    Only the first positional argument is used; it is indexed as
    ``(path_a, path_b)``.

    Returns:
        ``(jaccard_scores, error_bed_pairs)``. ``jaccard_scores`` maps
        ``str((path_a, path_b))`` to the score, and ``error_bed_pairs`` holds
        ``[path_a, path_b]`` if reading or scoring the pair failed. Both are
        empty when no pair is passed.
    """
    jaccard_scores = dict()
    error_bed_pairs = []
    if not bed_pairs:
        return jaccard_scores, error_bed_pairs

    bed_path1, bed_path2 = bed_pairs[0][0], bed_pairs[0][1]
    try:
        file1 = pr.read_bed(bed_path1)
        file2 = pr.read_bed(bed_path2)
        jaccard_score = file1.stats.jaccard(file2)
        jaccard_scores[str((bed_path1, bed_path2))] = jaccard_score
    except Exception as error:
        # Best-effort: report the failure to the caller instead of raising.
        print('error')
        print(error)
        # Record the paths. The parsed objects are unbound whenever read_bed
        # itself is what failed.
        error_bed_pairs.append([bed_path1, bed_path2])
    return jaccard_scores, error_bed_pairs
def __iter__(self) -> Iterable[Variant]:
    """Yield the combination variants of every merged interval in the BED file.

    The intervals of ``self.bed_file`` are merged without regard to strand
    and sorted; each resulting interval is handed to
    ``self.combination_variants`` together with ``self.variant_type``.
    """
    import pyranges as pr

    merged = pr.read_bed(self.bed_file).merge(strand=False).sort()
    for interval in pyranges_to_intervals(merged):
        yield from self.combination_variants(interval, self.variant_type)
def tidy_chromosome_column(polya_bed_path=None):
    '''
    Load a polyA BED file and prefix every chromosome name with 'chr'.

    The Chromosome column is cast to str before the prefix is added.
    Returns the table as a pyranges object.
    '''
    polya_df = pyr.read_bed(f=polya_bed_path, as_df=True)
    prefixed = 'chr' + polya_df['Chromosome'].astype(str)
    polya_df['Chromosome'] = prefixed
    return pyr.PyRanges(polya_df)
def filterPAC(fastaPath, bedPath, bedSummitPath, fillterPolyASitePath):
    """Write the polyA clusters whose summit flank is not A/T-rich.

    For every summit in ``bedSummitPath`` the genome sequence from
    ``Start - 10`` to ``End + 10`` is extracted. ``Ratio`` is the share of
    ``A`` in that sequence for ``+`` strand summits and of ``T`` otherwise.
    Clusters of ``bedPath`` whose ``Name`` belongs to a summit with
    ``Ratio <= 0.5`` are written, tab-separated without header or index, to
    ``fillterPolyASitePath``.
    """
    genomeFa = pyfastx.Fasta(fastaPath)

    def flank_sequence(row):
        # NOTE(review): Start - 10 is not clamped at 0 -- confirm this cannot
        # occur for summits near a sequence start.
        return genomeFa[row["Chromosome"]][row["Start"] - 10 : row["End"] + 10].seq

    def homopolymer_count(row):
        if row["Strand"] == "+":
            return row["seq"].count("A")
        return row["seq"].count("T")

    summits = pr.read_bed(bedSummitPath, True)
    summits["seq"] = summits.apply(flank_sequence, axis=1)
    summits["seqLength"] = summits["seq"].map(len)
    summits["Ratio"] = summits.apply(homopolymer_count, axis=1) / summits["seqLength"]

    usePolyASite = summits.loc[summits["Ratio"] <= 0.5, "Name"]

    cluster_ranges = pr.read_bed(bedPath, True)
    passed = cluster_ranges[cluster_ranges["Name"].isin(usePolyASite)]
    passed.to_csv(fillterPolyASitePath, sep="\t", header=None, index=None)
def main():
    """Annotate STRling calls with TRF matches and coverage, then save them.

    Exactly two TRF files and two coverage files (one per haplotype) are
    required; otherwise the program exits with an error message. The
    annotated calls are written tab-separated to ``args.out``.
    """
    args = parse_args()

    # One file per haplotype is expected for both kinds of input
    if len(args.trf) != 2:
        exit(f'ERROR: Expected 2 trf files, got {len(args.trf)}: {args.trf}')
    if len(args.cov) != 2:
        exit(f'ERROR: Expected 2 cov files, got {len(args.cov)}: {args.cov}')

    # Tables are pandas data frames (df), coverage tracks are pyranges (pr)
    strling_df = parse_bed(args.strling)
    trf_hap1_df = parse_bed(args.trf[0])
    trf_hap2_df = parse_bed(args.trf[1])
    cov_hap1_pr = pr.read_bed(args.cov[0])
    cov_hap2_pr = pr.read_bed(args.cov[1])

    # First attach the corresponding PacBio calls, then the coverage overlap
    matched_df = match_variants(strling_df, trf_hap1_df, trf_hap2_df, args.slop)
    annotated_df = annotate_cov(matched_df, cov_hap1_pr, cov_hap2_pr)

    annotated_df.to_csv(args.out, sep='\t', index=False)
def f2():
    """Load the bundled ``f2.bed`` example as a PyRanges.

    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # | Chromosome   | Start     | End       | Name       | Score     | Strand       |
    >>> # | (category)   | (int32)   | (int32)   | (object)   | (int64)   | (category)   |
    >>> # |--------------+-----------+-----------+------------+-----------+--------------|
    >>> # | chr1         | 1         | 2         | a          | 0         | +            |
    >>> # | chr1         | 6         | 7         | b          | 0         | -            |
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # Stranded PyRanges object has 2 rows and 6 columns from 1 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """
    return pr.read_bed(get_example_path("f2.bed"))
def get_bed_files(self):
    """Load the first three columns of every ``.bed`` file in the input dir.

    Regular files in ``self.input_bed_files_dir`` whose name ends in ``.bed``
    are read as data frames. Returns a list of dicts with the keys
    ``"bed_file"`` (columns 0-2 of the table) and ``"bed_file_name"`` (the
    file name without its extension).
    """
    bed_dir = self.input_bed_files_dir

    # collect the absolute paths of all .bed files in the input directory
    bed_paths = []
    for entry in listdir(bed_dir):
        candidate = os.path.join(bed_dir, entry)
        if isfile(candidate) and entry.endswith(".bed"):
            bed_paths.append(os.path.abspath(candidate))

    # read every file first, then reduce each table to its leading columns
    full_tables = [pr.read_bed(path, as_df=True) for path in bed_paths]
    reduced_tables = []
    for path, table in zip(bed_paths, full_tables):
        reduced_tables.append({
            "bed_file": table.iloc[:, [0, 1, 2]],
            "bed_file_name": Path(os.path.basename(path)).stem,
        })
    return reduced_tables
def f1():
    """Load the bundled ``f1.bed`` example as a PyRanges.

    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # | Chromosome   | Start     | End       | Name       | Score     | Strand       |
    >>> # | (category)   | (int32)   | (int32)   | (object)   | (int64)   | (category)   |
    >>> # |--------------+-----------+-----------+------------+-----------+--------------|
    >>> # | chr1         | 3         | 6         | interval1  | 0         | +            |
    >>> # | chr1         | 8         | 9         | interval3  | 0         | +            |
    >>> # | chr1         | 5         | 7         | interval2  | 0         | -            |
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # Stranded PyRanges object has 3 rows and 6 columns from 1 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """
    return pr.read_bed(get_example_path("f1.bed"))
def parseReadApaInfo(apaClusterPath, inBamPath, geneTag, expressionInfo):
    """
    Parse APA information and classify each read into its corresponding PAC.

    Every cluster of ``apaClusterPath`` is stored as a half-open span
    ``[Start, End)`` under its gene, the gene being the part of the cluster
    name before the first underscore. A read is assigned the first cluster of
    its gene (taken from the ``geneTag`` tag) that contains the read end
    position: the first aligned position for reverse reads, the last one
    otherwise. Reads without a matching cluster get ``<gene>_N_APA``.

    The per-read assignment is concatenated to ``expressionInfo`` and only
    entries whose index occurs in both inputs are returned.
    """
    apaCluster = pr.read_bed(apaClusterPath, True)
    apaCluster["Name"] = apaCluster["Name"] + "_APA"
    apaCluster["geneName"] = apaCluster["Name"].str.split("_").str[0]
    apaCluster = apaCluster.reindex(["geneName", "Name", "Start", "End"], axis=1)

    # gene -> {cluster name -> span}
    spansByGene = defaultdict(dict)
    for cluster in apaCluster.itertuples():
        spansByGene[cluster.geneName][cluster.Name] = portion.closedopen(
            cluster.Start, cluster.End
        )

    readsApaInfo = {}
    with pysam.AlignmentFile(inBamPath) as inBam:
        for i, read in enumerate(inBam, start=1):
            readGene = read.get_tag(geneTag)
            # default label, kept unless a cluster of the gene contains the end
            readApaName = f"{readGene}_N_APA"
            geneSpans = spansByGene.get(readGene)
            if geneSpans is not None:
                readEndPos = read.positions[0] if read.is_reverse else read.positions[-1]
                for apaName, apaSpan in geneSpans.items():
                    if readEndPos in apaSpan:
                        readApaName = apaName
                        break
            readsApaInfo[read.qname] = readApaName
            if i % 100000 == 0:
                logger.info(f"{i} reads processed")

    readsApaInfo = pd.Series(readsApaInfo)
    useUmi = list(set(readsApaInfo.index) & set(expressionInfo.index))
    # NOTE(review): concat uses the default axis, i.e. the assignments are
    # appended below expressionInfo rather than added as a column -- confirm.
    expressionInfo = pd.concat([expressionInfo, readsApaInfo])
    expressionInfo = expressionInfo.loc[expressionInfo.index.isin(useUmi)]
    return expressionInfo
def vcf_to_pyranges(vcf, tmpfile):
    '''
    Return the variant locations and ids of a VCF as a PyRanges.

    Each record with at most one alternative allele becomes the interval
    [pos - 1, pos - 1 + len(ref)) named after the record id. The intervals
    are written to the bed-file ``tmpfile``, read back with pyranges, and
    ``tmpfile`` is removed again whether or not reading succeeds.
    '''
    bedtool = BedTool(
        Interval(record.chrom,
                 record.pos - 1,
                 record.pos - 1 + len(record.ref),
                 name=record.id)
        for record in vcf.fetch()
        if len(record.alts) <= 1
    )
    bedtool.moveto(tmpfile)

    try:
        return pyranges.read_bed(tmpfile)
    finally:
        # Single cleanup path for success and failure; an error from read_bed
        # propagates with its original traceback.
        del bedtool
        os.remove(tmpfile)
def lojs_overlap(feature_files, compare_pr):
    """
    Flag the regions of compare_pr that overlap a consensus of the feature files.

    A position counts as consensus when it is covered by at least n of the
    (individually merged) feature files, where n is 1 for up to 2 files,
    2 for 3-7 files and a quarter of the number of files otherwise.

    Args:
        :param feature_files: list of paths to bed files to intersect with
            compare_pr
        :param compare_pr: pyranges object containing all regions of interest.
            Should have column 'idx'. Added in function
            epitome.functions.bed2Pyranges.

    :return arr: array with one entry per region of compare_pr, ordered by
        'idx': 1 where the region overlaps a consensus interval, else 0.
        All zeros when feature_files is empty.
    """
    n_files = len(feature_files)
    if n_files == 0:
        # Logger.warn is a deprecated alias of Logger.warning
        logger.warning("WARN: lojs_overlap failed for all files %s with 0 lines",
                       ','.join(feature_files))
        return np.zeros(len(compare_pr))

    #### Number of files that must share a consensus ####
    if n_files <= 2:
        n = 1  # if there are 1-2 files just include all
    elif n_files <= 7:
        n = 2
    else:
        n = int(n_files / 4)  # in 25% of files

    # Very slow: concatenate all bed files and only take regions with n overlap
    group_pr = pr.concat([pr.read_bed(i).merge() for i in feature_files])
    group_df = group_pr.merge(count=True).df

    # Keep the consensus intervals and remove the helper count column. A new
    # frame is built because an in-place drop would act on a filtered copy.
    group_df = group_df[group_df['Count'] >= n].drop(columns='Count')

    # Use the same integer width as the regions being compared against
    type_ = (compare_pr.Start.dtype == 'int64')
    pr1 = pr.PyRanges(group_df, int64=type_)

    intersected = compare_pr.count_overlaps(pr1)
    arr = intersected.df.sort_values(by='idx')['NumberOverlaps'].values
    # Binarise: any number of overlaps counts as a hit
    arr[arr > 0] = 1
    return arr
def main(infile, gene_bed, out_suffix, threads):
    """Detect polyA clusters per gene and write them as BED files.

    ``get_three_end`` is run in a process pool for every gene of ``gene_bed``
    whose chromosome is not ``Mt`` or ``Pt``. Each non-``None`` result is
    unpacked into six parts which are written to six files named
    ``<out_suffix>.<kind>.bed``.

    Args:
        infile: input forwarded to ``get_three_end``.
        gene_bed: BED file with the gene models (indexed by ``Name``).
        out_suffix: prefix of all output file names.
        threads: number of worker processes.
    """
    from contextlib import ExitStack

    # Read the gene_bed information
    gene_model = pr.read_bed(gene_bed, as_df=True)
    gene_model = gene_model.set_index(['Name'])

    results = []
    with ProcessPoolExecutor(max_workers=threads) as e:
        for gene_id in gene_model.index:
            if gene_model.at[gene_id, 'Chromosome'] not in {'Mt', 'Pt'}:
                results.append(e.submit(get_three_end, infile, gene_id, gene_model))

    output_names = (
        'polya_cluster.bed',
        'polya_cluster.summit.bed',
        'major_polya_cluster.bed',
        'major_polya_cluster_summit.bed',
        'last_polya_cluster.bed',
        'last_polya_cluster_summit.bed',
    )
    # ExitStack closes every file even if a worker result raises.
    with ExitStack() as stack:
        (
            o_polya_cluster,
            o_polya_cluster_summit,
            o_major_polya_cluster,
            o_major_polya_cluster_summit,
            o_last_polya_cluster,
            o_last_polya_cluster_summit,
        ) = [
            stack.enter_context(open(f'{out_suffix}.{name}', 'w'))
            for name in output_names
        ]

        for res in results:
            result = res.result()
            if result is None:
                continue
            (
                polya_cluster,
                polya_cluster_summit,
                polya_cluster_major,
                polya_cluster_summit_major,
                polya_cluster_last,
                polya_cluster_summit_last,
            ) = result

            # The first two parts are iterables of lines, the rest single
            # strings.
            o_polya_cluster.writelines(polya_cluster)
            o_polya_cluster_summit.writelines(polya_cluster_summit)
            o_major_polya_cluster.write(polya_cluster_major)
            o_major_polya_cluster_summit.write(polya_cluster_summit_major)
            o_last_polya_cluster.write(polya_cluster_last)
            o_last_polya_cluster_summit.write(polya_cluster_summit_last)
def parse_bed_files(bed_files):
    """Combine several BED files into one unstranded, name-only PyRanges.

    Returns ``None`` when ``bed_files`` is empty. Every file must provide a
    ``Name`` column and the names must be unique across all files; both
    conditions are checked with assertions.
    """
    # Nothing to do without input files
    if len(bed_files) == 0:
        return

    # Read everything up front, then validate each file
    beds = list(map(pr.read_bed, bed_files))
    for bed_file, bed in zip(bed_files, beds):
        assert "Name" in bed.columns, f"Name (column 4) missing from {bed_file}."

    # One combined object, strand removed, only the Name column kept
    combined = pr.concat(beds).unstrand()[["Name"]]

    # Names act as identifiers, so they must not repeat
    assert combined.Name.is_unique, "Names (column 4) not unique across BED files."

    return combined
def chipseq_background():
    """Load the bundled ``chipseq_background.bed`` example as a PyRanges.

    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # | Chromosome   | Start     | End       | Name       | Score     | Strand       |
    >>> # | (category)   | (int32)   | (int32)   | (object)   | (int64)   | (category)   |
    >>> # |--------------+-----------+-----------+------------+-----------+--------------|
    >>> # | chr1         | 39036822  | 39036847  | U0         | 0         | +            |
    >>> # | chr1         | 224145989 | 224146014 | U0         | 0         | +            |
    >>> # | chr1         | 167802964 | 167802989 | U0         | 0         | +            |
    >>> # | chr1         | 69101066  | 69101091  | U0         | 0         | +            |
    >>> # | ...          | ...       | ...       | ...        | ...       | ...          |
    >>> # | chrY         | 11936866  | 11936891  | U0         | 0         | -            |
    >>> # | chrY         | 10629111  | 10629136  | U0         | 0         | -            |
    >>> # | chrY         | 10632456  | 10632481  | U0         | 0         | -            |
    >>> # | chrY         | 11918814  | 11918839  | U0         | 0         | -            |
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # Stranded PyRanges object has 10,000 rows and 6 columns from 25 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """
    return pr.read_bed(get_example_path("chipseq_background.bed"))
def chipseq():
    """Load the bundled ``chipseq.bed`` example as a PyRanges.

    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # | Chromosome   | Start     | End       | Name       | Score     | Strand       |
    >>> # | (category)   | (int32)   | (int32)   | (object)   | (int64)   | (category)   |
    >>> # |--------------+-----------+-----------+------------+-----------+--------------|
    >>> # | chr1         | 212609534 | 212609559 | U0         | 0         | +            |
    >>> # | chr1         | 169887529 | 169887554 | U0         | 0         | +            |
    >>> # | chr1         | 216711011 | 216711036 | U0         | 0         | +            |
    >>> # | chr1         | 144227079 | 144227104 | U0         | 0         | +            |
    >>> # | ...          | ...       | ...       | ...        | ...       | ...          |
    >>> # | chrY         | 15224235  | 15224260  | U0         | 0         | -            |
    >>> # | chrY         | 13517892  | 13517917  | U0         | 0         | -            |
    >>> # | chrY         | 8010951   | 8010976   | U0         | 0         | -            |
    >>> # | chrY         | 7405376   | 7405401   | U0         | 0         | -            |
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # Stranded PyRanges object has 10,000 rows and 6 columns from 24 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """
    return pr.read_bed(get_example_path("chipseq.bed"))
def chromsizes():
    """Load the bundled ``chromsizes.bed`` example as a PyRanges.

    >>> # +--------------+-----------+-----------+
    >>> # | Chromosome   | Start     | End       |
    >>> # | (category)   | (int32)   | (int32)   |
    >>> # |--------------+-----------+-----------|
    >>> # | chr1         | 0         | 249250621 |
    >>> # | chr2         | 0         | 243199373 |
    >>> # | chr3         | 0         | 198022430 |
    >>> # | chr4         | 0         | 191154276 |
    >>> # | ...          | ...       | ...       |
    >>> # | chrY         | 0         | 59373566  |
    >>> # | chrX         | 0         | 155270560 |
    >>> # | chrM         | 0         | 16571     |
    >>> # | chr22        | 0         | 51304566  |
    >>> # +--------------+-----------+-----------+
    >>> # Unstranded PyRanges object has 25 rows and 3 columns from 25 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome.
    """
    return pr.read_bed(get_example_path("chromsizes.bed"))
def exons():
    """Load the bundled ``exons.bed`` example as a PyRanges.

    >>> # +--------------+-----------+-----------+----------------------------------------+-----------+--------------+
    >>> # | Chromosome   | Start     | End       | Name                                   | Score     | Strand       |
    >>> # | (category)   | (int32)   | (int32)   | (object)                               | (int64)   | (category)   |
    >>> # |--------------+-----------+-----------+----------------------------------------+-----------+--------------|
    >>> # | chrX         | 135721701 | 135721963 | NR_038462_exon_0_0_chrX_135721702_f    | 0         | +            |
    >>> # | chrX         | 135574120 | 135574598 | NM_001727_exon_2_0_chrX_135574121_f    | 0         | +            |
    >>> # | chrX         | 47868945  | 47869126  | NM_205856_exon_4_0_chrX_47868946_f     | 0         | +            |
    >>> # | chrX         | 77294333  | 77294480  | NM_000052_exon_17_0_chrX_77294334_f    | 0         | +            |
    >>> # | ...          | ...       | ...       | ...                                    | ...       | ...          |
    >>> # | chrY         | 15409586  | 15409728  | NR_047633_exon_3_0_chrY_15409587_r     | 0         | -            |
    >>> # | chrY         | 15478146  | 15478273  | NR_047634_exon_18_0_chrY_15478147_r    | 0         | -            |
    >>> # | chrY         | 15360258  | 15361762  | NR_047601_exon_0_0_chrY_15360259_r     | 0         | -            |
    >>> # | chrY         | 15467254  | 15467278  | NM_001258270_exon_13_0_chrY_15467255_r | 0         | -            |
    >>> # +--------------+-----------+-----------+----------------------------------------+-----------+--------------+
    >>> # Stranded PyRanges object has 1,000 rows and 6 columns from 2 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """
    return pr.read_bed(get_example_path("exons.bed"))
def aorta():
    """Load the bundled ``aorta.bed`` example as a PyRanges.

    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # | Chromosome   | Start     | End       | Name       | Score     | Strand       |
    >>> # | (category)   | (int32)   | (int32)   | (object)   | (int64)   | (category)   |
    >>> # |--------------+-----------+-----------+------------+-----------+--------------|
    >>> # | chr1         | 9939      | 10138     | H3K27me3   | 7         | +            |
    >>> # | chr1         | 9953      | 10152     | H3K27me3   | 5         | +            |
    >>> # | chr1         | 10024     | 10223     | H3K27me3   | 1         | +            |
    >>> # | chr1         | 10246     | 10445     | H3K27me3   | 4         | +            |
    >>> # | ...          | ...       | ...       | ...        | ...       | ...          |
    >>> # | chr1         | 9978      | 10177     | H3K27me3   | 7         | -            |
    >>> # | chr1         | 10001     | 10200     | H3K27me3   | 5         | -            |
    >>> # | chr1         | 10127     | 10326     | H3K27me3   | 1         | -            |
    >>> # | chr1         | 10241     | 10440     | H3K27me3   | 6         | -            |
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # Stranded PyRanges object has 11 rows and 6 columns from 1 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """
    return pr.read_bed(get_example_path("aorta.bed"))
def aorta2():
    """Load the bundled ``aorta2.bed`` example as a PyRanges.

    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # | Chromosome   | Start     | End       | Name       | Score     | Strand       |
    >>> # | (category)   | (int32)   | (int32)   | (object)   | (int64)   | (category)   |
    >>> # |--------------+-----------+-----------+------------+-----------+--------------|
    >>> # | chr1         | 10073     | 10272     | Input      | 1         | +            |
    >>> # | chr1         | 10280     | 10479     | Input      | 1         | +            |
    >>> # | chr1         | 16056     | 16255     | Input      | 1         | +            |
    >>> # | chr1         | 16064     | 16263     | Input      | 1         | +            |
    >>> # | ...          | ...       | ...       | ...        | ...       | ...          |
    >>> # | chr1         | 10079     | 10278     | Input      | 1         | -            |
    >>> # | chr1         | 10082     | 10281     | Input      | 1         | -            |
    >>> # | chr1         | 10149     | 10348     | Input      | 1         | -            |
    >>> # | chr1         | 19958     | 20157     | Input      | 1         | -            |
    >>> # +--------------+-----------+-----------+------------+-----------+--------------+
    >>> # Stranded PyRanges object has 10 rows and 6 columns from 1 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """
    return pr.read_bed(get_example_path("aorta2.bed"))
def test_pyranges_to_intervals():
    """Check pyranges_to_intervals on a GTF (with attributes) and on a BED."""
    # GTF input: the requested attributes must be carried over
    gtf_ranges = pyranges.read_gtf(gtf_file)
    attr_names = ['gene_id', 'gene_name', 'transcript_id', 'exon_id']
    gtf_intervals = list(
        pyranges_to_intervals(gtf_ranges, interval_attrs=attr_names))

    assert len(gtf_intervals) == 5
    expected_attrs = {
        'gene_id': 'ENSG00000012048',
        'gene_name': 'BRCA1',
        'transcript_id': 'ENST00000357654',
        'exon_id': 'ENSE00003510592',
    }
    for attr_name, attr_value in expected_attrs.items():
        assert gtf_intervals[4].attrs[attr_name] == attr_value

    # BED input: coordinates of the source ranges and of the first interval
    bed_ranges = pyranges.read_bed(example_intervals_bed)
    bed_intervals = list(pyranges_to_intervals(bed_ranges))

    assert len(bed_intervals) == 4
    assert bed_intervals[0].start == 2
    assert bed_ranges.Chromosome.tolist() == ['chr1'] * 4
    assert bed_ranges.Start.tolist() == [2, 2, 2, 602]
    assert bed_ranges.End.tolist() == [1000, 5000, 1002, 604]
def dfunc(self, *args, **kwargs):
    """Load the BED data on first use, then delegate to the wrapped ``func``.

    ``self.input_path`` is read into ``self.data`` only while
    ``self.initialized`` is false; the flag is set once the read succeeded.
    """
    if self.initialized:
        return func(self, *args, **kwargs)
    self.data = pyranges.read_bed(self.input_path)
    self.initialized = True
    return func(self, *args, **kwargs)