コード例 #1
0
def file_to_unbinned_ranges(f, args, _):
    """Read an alignment file into a PyRanges and clip it to the genome bounds.

    The file format is taken from ``sniff(f)`` and selects the reader:
    BED / gzipped BED and BEDPE are parsed with ``pd.read_csv``; anything
    else that is not ``"bampe"`` is handed to ``pr.read_bam``.

    Parameters
    ----------
    f
        File to read; passed to ``sniff`` and then to the selected reader.
    args
        Mapping that provides ``"drop_duplicates"`` (truthy to drop duplicate
        positions) and ``"chromsizes_"`` (passed to ``pr.gf.genome_bounds``).
    _
        Unused; kept so the call signature stays unchanged.

    Returns
    -------
    The ranges returned by ``pr.gf.genome_bounds``.

    Raises
    ------
    NotImplementedError
        If the sniffed format is ``"bampe"``.
    """
    file_format = sniff(f)

    names = "Chromosome Start End Strand".split()
    # Membership test, not `== "bed" or "bed.gz"`: that expression is always
    # truthy and would route every format through the BED reader.
    if file_format in ("bed", "bed.gz"):
        df = pd.read_csv(f,
                         sep="\t",
                         header=None,
                         usecols=[0, 1, 2, 5],
                         names=names)
        gr = pr.PyRanges(df)
    elif file_format == "bedpe":
        # BEDPE uses different column positions for End and Strand.
        df = pd.read_csv(f,
                         sep="\t",
                         header=None,
                         usecols=[0, 1, 5, 9],
                         names=names)
        gr = pr.PyRanges(df)
    elif file_format == "bampe":
        raise NotImplementedError(
            "bampe format is not supported at the moment")
    else:
        gr = pr.read_bam(f)

    if args["drop_duplicates"]:
        gr = gr.drop_duplicate_positions()

    return pr.gf.genome_bounds(gr, args["chromsizes_"])
コード例 #2
0
def file_to_unbinned_ranges(f, args, _):
    """Read an alignment file into a PyRanges and clip it to the genome bounds.

    The file format is taken from ``sniff(f)``: BED / gzipped BED and BEDPE
    are parsed with ``pd.read_csv``; every other format is handed to
    ``pr.read_bam``.

    Parameters
    ----------
    f
        File to read; passed to ``sniff`` and then to the selected reader.
    args
        Mapping that provides ``"drop_duplicates"`` (truthy to drop duplicate
        positions) and ``"chromsizes_"`` (passed to ``pr.gf.genome_bounds``).
    _
        Unused; kept so the call signature stays unchanged.

    Returns
    -------
    The ranges returned by ``pr.gf.genome_bounds``.
    """
    file_format = sniff(f)

    names = "Chromosome Start End Strand".split()
    # Membership test, not `== "bed" or "bed.gz"`: that expression is always
    # truthy and would make the BEDPE and BAM branches unreachable.
    if file_format in ("bed", "bed.gz"):
        df = pd.read_csv(f,
                         sep="\t",
                         header=None,
                         usecols=[0, 1, 2, 5],
                         names=names)
        gr = pr.PyRanges(df)
    elif file_format == "bedpe":
        # BEDPE uses different column positions for End and Strand.
        df = pd.read_csv(f,
                         sep="\t",
                         header=None,
                         usecols=[0, 1, 5, 9],
                         names=names)
        gr = pr.PyRanges(df)
    else:
        gr = pr.read_bam(f)

    if args["drop_duplicates"]:
        gr = gr.drop_duplicate_positions()

    return pr.gf.genome_bounds(gr, args["chromsizes_"])
コード例 #3
0
def file_to_unbinned_ranges(f, args, _):
    """Read an alignment file into a PyRanges.

    The file format is taken from ``sniff(f)``: BED / gzipped BED and BEDPE
    are parsed with ``pd.read_csv``; every other format is handed to
    ``pr.read_bam``.

    Parameters
    ----------
    f
        File to read; passed to ``sniff`` and then to the selected reader.
    args
        Unused in this variant; kept so the call signature stays unchanged.
    _
        Unused; kept so the call signature stays unchanged.

    Returns
    -------
    The PyRanges built from the file contents.
    """
    file_format = sniff(f)

    names = "Chromosome Start End Strand".split()
    # Membership test, not `== "bed" or "bed.gz"`: that expression is always
    # truthy and would make the BEDPE and BAM branches unreachable.
    if file_format in ("bed", "bed.gz"):
        df = pd.read_csv(f,
                         sep="\t",
                         header=None,
                         usecols=[0, 1, 2, 5],
                         names=names)
        return pr.PyRanges(df)

    if file_format == "bedpe":
        # BEDPE uses different column positions for End and Strand.
        df = pd.read_csv(f,
                         sep="\t",
                         header=None,
                         usecols=[0, 1, 5, 9],
                         names=names)
        return pr.PyRanges(df)

    return pr.read_bam(f)
コード例 #4
0
def control_bam():
    """Read the example file ``control.bam`` with ``pr.read_bam``.

    The path is resolved through ``get_example_path``. Printed form of the
    result, as recorded in the original documentation:

    >>> # +--------------+-----------+-----------+--------------+------------+
    >>> # | Chromosome   | Start     | End       | Strand       | Flag       |
    >>> # | (category)   | (int32)   | (int32)   | (category)   | (uint16)   |
    >>> # |--------------+-----------+-----------+--------------+------------|
    >>> # | chr1         | 887771    | 887796    | +            | 16         |
    >>> # | chr1         | 994660    | 994685    | +            | 16         |
    >>> # | chr1         | 1770383   | 1770408   | +            | 16         |
    >>> # | chr1         | 1995141   | 1995166   | +            | 16         |
    >>> # | ...          | ...       | ...       | ...          | ...        |
    >>> # | chrY         | 57402214  | 57402239  | +            | 16         |
    >>> # | chrY         | 10643526  | 10643551  | -            | 0          |
    >>> # | chrY         | 11776321  | 11776346  | -            | 0          |
    >>> # | chrY         | 20557165  | 20557190  | -            | 0          |
    >>> # +--------------+-----------+-----------+--------------+------------+
    >>> # Stranded PyRanges object has 10,000 rows and 5 columns from 25 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    """
    return pr.read_bam(get_example_path("control.bam"))
コード例 #5
0
def read_bam_bin_counts(bins: PyRanges, bams: Dict[str, str], excluded: PyRanges = None, **kwargs) -> AnnData:
    """Count reads per bin for each BAM file and assemble an AnnData.

    For every ``(cell_id, filename)`` pair the BAM is read with
    ``pr.read_bam``, optionally filtered against ``excluded``, restricted to
    ``bins`` with ``how='containment'``, and counted per bin with
    ``bins.count_overlaps``.

    Parameters
    ----------
    bins : pyranges.PyRanges
        Bins in which to count reads.
    bams : Dict[str, str]
        BAM filenames keyed by cell id.
    excluded : pyranges.PyRanges, optional
        Regions used to filter reads via ``intersect(excluded, invert=True)``;
        skipped when ``None``.
    **kwargs
        Forwarded unchanged to ``pr.read_bam``.

    Returns
    -------
    ad.AnnData
        Transposed count table with the cell ids as ``obs`` and the indexed
        bin table as ``var``.
    """
    # The bin table is prepared up front and later becomes the `var` frame.
    var_frame = _add_bin_index(_convert_pyranges(bins))

    counts_by_cell = {}
    for cell_id, bam_path in bams.items():
        logging.info(f"reading {bam_path}")
        reads = pr.read_bam(bam_path, **kwargs)

        if excluded is not None:
            logging.info("excluding reads")
            reads = reads.intersect(excluded, invert=True)

        logging.info("count overlaps")
        contained_reads = reads.intersect(bins, how='containment')
        per_bin = bins.count_overlaps(contained_reads, overlap_col='reads')

        # Give the counts the same bin index as `var_frame` before storing.
        per_bin = _add_bin_index(_convert_pyranges(per_bin))
        counts_by_cell[cell_id] = per_bin['reads']

    counts_frame = pd.DataFrame(counts_by_cell)

    obs_frame = pd.DataFrame({'cell_id': counts_frame.columns.values}).set_index('cell_id')

    return ad.AnnData(
        counts_frame.T,
        obs=obs_frame,
        var=var_frame,
    )
コード例 #6
0
def control_bam():
    """Read the example file ``control.bam`` with ``pr.read_bam``.

    The path is resolved through ``get_example_path``.
    """
    return pr.read_bam(get_example_path("control.bam"))
コード例 #7
0
def test_read_bam():
    """Smoke test: reading the sorted test BAM must not raise."""
    bam_path = "tests/test_data/test_sorted.bam"
    pr.read_bam(bam_path)
コード例 #8
0
 def read_bfile(self):
     """Validate ``self.bfile`` and read it with the matching pyranges reader.

     ``pr.read_bed`` is used when ``self.filetype`` is ``Type.BED``; any
     other file type is read with ``pr.read_bam``.
     """
     self.check_bfile()
     reader = pr.read_bed if self.filetype == Type.BED else pr.read_bam
     return reader(self.bfile)
コード例 #9
0
 def get_bam_ranges(self, filter_flag=3844):
     """Read ``self.bam`` into ranges with ``pr.read_bam``.

     Parameters
     ----------
     filter_flag : int
         Passed through as ``pr.read_bam``'s ``filter_flag``. The default
         3844 equals 4 + 256 + 512 + 1024 + 2048; in SAM flag terms that is
         unmapped, secondary, QC-fail, duplicate and supplementary.
         NOTE(review): confirm against the pyranges documentation that
         reads with any of these bits set are the ones dropped.

     Returns
     -------
     The object returned by ``pr.read_bam``.
     """
     logging.info("Extracting BAM ranges")
     # note that read_bam by default has a specific set of SAM flags
     # this code needs to be updated to allow for selection of +/-, primary
     # secondary etc ... - this method is also independent of pysam//samtools
     # Forward the caller's value; a hard-coded 3844 here would silently
     # ignore the `filter_flag` argument.
     return pr.read_bam(self.bam, filter_flag=filter_flag)