def file_to_unbinned_ranges(f, args, _):
    """Read an alignment file into a PyRanges, dispatching on its sniffed format.

    Parameters
    ----------
    f
        Path of the file to read; handed to ``sniff`` and then to the
        matching reader.
    args
        Mapping that must provide ``"drop_duplicates"`` (truthy to drop
        duplicate positions) and ``"chromsizes_"`` (passed to
        ``pr.gf.genome_bounds``).
    _
        Unused; kept so the signature stays compatible with callers.

    Returns
    -------
    The ranges read from ``f`` after optional duplicate removal and after
    ``pr.gf.genome_bounds`` has been applied.

    Raises
    ------
    NotImplementedError
        If the sniffed format is ``"bampe"``.
    """
    file_format = sniff(f)
    names = "Chromosome Start End Strand".split()

    # Use a membership test: the previous `file_format == "bed" or "bed.gz"`
    # was always truthy (a non-empty string literal), so every input was
    # parsed as BED and the branches below could never run.
    if file_format in ("bed", "bed.gz"):
        df = pd.read_csv(f, sep="\t", header=None, usecols=[0, 1, 2, 5], names=names)
        gr = pr.PyRanges(df)
    elif file_format == "bedpe":
        df = pd.read_csv(f, sep="\t", header=None, usecols=[0, 1, 5, 9], names=names)
        gr = pr.PyRanges(df)
    elif file_format == "bampe":
        # Shouldn't be reachable at the moment, bampe format is not supported here yet.
        raise NotImplementedError(
            "bampe format is not supported at the moment")
    else:
        # Any other sniffed format is handed to the BAM reader.
        gr = pr.read_bam(f)

    if args["drop_duplicates"]:
        gr = gr.drop_duplicate_positions()

    gr = pr.gf.genome_bounds(gr, args["chromsizes_"])
    return gr
def file_to_unbinned_ranges(f, args, _):
    """Read an alignment file into a PyRanges, dispatching on its sniffed format.

    Parameters
    ----------
    f
        Path of the file to read; handed to ``sniff`` and then to the
        matching reader.
    args
        Mapping that must provide ``"drop_duplicates"`` (truthy to drop
        duplicate positions) and ``"chromsizes_"`` (passed to
        ``pr.gf.genome_bounds``).
    _
        Unused; kept so the signature stays compatible with callers.

    Returns
    -------
    The ranges read from ``f`` after optional duplicate removal and after
    ``pr.gf.genome_bounds`` has been applied.
    """
    file_format = sniff(f)
    names = "Chromosome Start End Strand".split()

    # Use a membership test: the previous `file_format == "bed" or "bed.gz"`
    # was always truthy (a non-empty string literal), so every input was
    # parsed as BED and the bedpe/BAM branches could never run.
    if file_format in ("bed", "bed.gz"):
        df = pd.read_csv(f, sep="\t", header=None, usecols=[0, 1, 2, 5], names=names)
        gr = pr.PyRanges(df)
    elif file_format == "bedpe":
        df = pd.read_csv(f, sep="\t", header=None, usecols=[0, 1, 5, 9], names=names)
        gr = pr.PyRanges(df)
    else:
        # Any other sniffed format is handed to the BAM reader.
        gr = pr.read_bam(f)

    if args["drop_duplicates"]:
        gr = gr.drop_duplicate_positions()

    gr = pr.gf.genome_bounds(gr, args["chromsizes_"])
    return gr
def file_to_unbinned_ranges(f, args, _):
    """Read an alignment file into a PyRanges, dispatching on its sniffed format.

    Parameters
    ----------
    f
        Path of the file to read; handed to ``sniff`` and then to the
        matching reader.
    args
        Unused in this function; kept for signature compatibility.
    _
        Unused; kept for signature compatibility.

    Returns
    -------
    The ranges read from ``f`` with no further filtering applied.
    """
    file_format = sniff(f)
    names = "Chromosome Start End Strand".split()

    # Use a membership test: the previous `file_format == "bed" or "bed.gz"`
    # was always truthy (a non-empty string literal), so every input was
    # parsed as BED and the bedpe/BAM branches could never run.
    if file_format in ("bed", "bed.gz"):
        df = pd.read_csv(f, sep="\t", header=None, usecols=[0, 1, 2, 5], names=names)
        gr = pr.PyRanges(df)
    elif file_format == "bedpe":
        df = pd.read_csv(f, sep="\t", header=None, usecols=[0, 1, 5, 9], names=names)
        gr = pr.PyRanges(df)
    else:
        # Any other sniffed format is handed to the BAM reader.
        gr = pr.read_bam(f)

    return gr
def control_bam():
    """Read the example file ``control.bam`` with ``pr.read_bam``.

    The file is located through ``get_example_path``. Sample of the result:

    >>> # +--------------+-----------+-----------+--------------+------------+
    >>> # | Chromosome   | Start     | End       | Strand       | Flag       |
    >>> # | (category)   | (int32)   | (int32)   | (category)   | (uint16)   |
    >>> # |--------------+-----------+-----------+--------------+------------|
    >>> # | chr1         | 887771    | 887796    | +            | 16         |
    >>> # | chr1         | 994660    | 994685    | +            | 16         |
    >>> # | chr1         | 1770383   | 1770408   | +            | 16         |
    >>> # | chr1         | 1995141   | 1995166   | +            | 16         |
    >>> # | ...          | ...       | ...       | ...          | ...        |
    >>> # | chrY         | 57402214  | 57402239  | +            | 16         |
    >>> # | chrY         | 10643526  | 10643551  | -            | 0          |
    >>> # | chrY         | 11776321  | 11776346  | -            | 0          |
    >>> # | chrY         | 20557165  | 20557190  | -            | 0          |
    >>> # +--------------+-----------+-----------+--------------+------------+
    >>> # Stranded PyRanges object has 10,000 rows and 5 columns from 25 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """
    return pr.read_bam(get_example_path("control.bam"))
def read_bam_bin_counts(bins: PyRanges, bams: Dict[str, str], excluded: PyRanges = None, **kwargs) -> AnnData:
    """
    Build a cell-by-bin matrix of read counts from per-cell BAM files.

    Parameters
    ----------
    bins : pyranges.PyRanges
        Genomic bins in which reads are counted.
    bams : Dict[str, str]
        Mapping from cell id to the BAM filename for that cell.
    excluded : pyranges.PyRanges, optional
        Regions used to filter reads; when given, each cell's reads are
        passed through ``intersect(excluded, invert=True)`` before counting.
    **kwargs
        Forwarded unchanged to ``pr.read_bam``.

    Returns
    -------
    ad.AnnData
        Read counts with one observation per cell id and one variable per bin.
    """
    # Bin annotation table, used as the ``var`` frame of the result.
    var_table = _add_bin_index(_convert_pyranges(bins))

    per_cell_reads = {}
    for cell_id, bam_path in bams.items():
        logging.info(f"reading {bam_path}")
        reads = pr.read_bam(bam_path, **kwargs)

        if excluded is not None:
            logging.info("excluding reads")
            reads = reads.intersect(excluded, invert=True)

        logging.info("count overlaps")
        # Restrict the reads against the bins first, then count per bin.
        contained = reads.intersect(bins, how='containment')
        counted = bins.count_overlaps(contained, overlap_col='reads')
        per_cell_reads[cell_id] = _add_bin_index(_convert_pyranges(counted))['reads']

    # Columns are cells here; the matrix is transposed for AnnData below.
    counts = pd.DataFrame(per_cell_reads)
    obs_table = pd.DataFrame({'cell_id': counts.columns.values}).set_index('cell_id')

    return ad.AnnData(
        counts.T,
        obs=obs_table,
        var=var_table,
    )
def control_bam():
    """Read the example file ``control.bam`` (located via ``get_example_path``) with ``pr.read_bam``."""
    return pr.read_bam(get_example_path("control.bam"))
def test_read_bam():
    """Smoke test: ``pr.read_bam`` runs on the sorted test BAM without raising."""
    bam_path = "tests/test_data/test_sorted.bam"
    pr.read_bam(bam_path)
def read_bfile(self):
    """Validate ``self.bfile`` via ``check_bfile`` and read it.

    Uses ``pr.read_bed`` when ``self.filetype`` is ``Type.BED`` and
    ``pr.read_bam`` for every other file type.
    """
    self.check_bfile()
    reader = pr.read_bed if self.filetype == Type.BED else pr.read_bam
    return reader(self.bfile)
def get_bam_ranges(self, filter_flag=3844):
    """Read ``self.bam`` with ``pr.read_bam`` and return the result.

    Parameters
    ----------
    filter_flag : int, optional
        Value forwarded as ``filter_flag`` to ``pr.read_bam``
        (default 3844). Presumably a SAM flag bitmask of reads to
        discard -- confirm against the pyranges documentation.

    Returns
    -------
    Whatever ``pr.read_bam`` returns for ``self.bam``.
    """
    logging.info("Extracting BAM ranges")
    # note that read_bam by default has a specific set of SAM flags
    # this code needs to be updated to allow for selection of +/-, primary
    # secondary etc ... - this method is also independent of pysam//samtools
    #
    # Forward the caller's value: the call previously hard-coded 3844, so
    # the ``filter_flag`` argument was silently ignored.
    return pr.read_bam(self.bam, filter_flag=filter_flag)