Exemple #1
0
def test_read_chromsizes():
    """Exercise ``bioframe.read_chromsizes`` on malformed and valid input."""
    # Name-only rows (no second column, one name repeated) are expected
    # to be rejected with a ValueError.
    malformed = "chr1\nchr2\nchr2"
    with pytest.raises(ValueError):
        bioframe.read_chromsizes(StringIO(malformed))

    # Two-column input listed as chr1, chr3, chr2 (plus a trailing
    # whitespace-only line); the result is expected in chr1, chr2, chr3 order.
    well_formed = "chr1\t1\nchr3\t2\nchr2\t3\n "
    result = bioframe.read_chromsizes(StringIO(well_formed))

    assert type(result) is pd.Series
    assert result.name == "length"
    assert list(result.index) == ["chr1", "chr2", "chr3"]
    assert list(result.values) == [1, 3, 2]
Exemple #2
0
def binnify(chromsizes_path, binsize, all_names):
    """Print a tab-separated table of bins to stdout.

    Parameters
    ----------
    chromsizes_path
        Forwarded to ``bioframe.read_chromsizes`` as the source to read.
    binsize
        Forwarded to ``bioframe.binnify`` as the bin size.
    all_names
        When truthy, ``read_chromsizes`` is called with
        ``filter_chroms=False``; otherwise with ``filter_chroms=True``.
    """
    import bioframe

    apply_filter = not all_names
    sizes = bioframe.read_chromsizes(chromsizes_path, filter_chroms=apply_filter)
    table = bioframe.binnify(sizes, binsize)

    # No index column; ``print`` appends its own newline after the CSV text.
    tsv_text = table.to_csv(sep="\t", index=False)
    print(tsv_text)
Exemple #3
0
def digest(chromsizes_path, fasta_path, enzyme_name):
    """Print a tab-separated table of restriction fragments to stdout.

    Parameters
    ----------
    chromsizes_path
        Forwarded to ``bioframe.read_chromsizes`` (with ``all_names=True``).
    fasta_path
        Forwarded to ``bioframe.load_fasta`` (pyfaidx engine, raw records).
    enzyme_name
        Forwarded to ``bioframe.tools.digest``.

    Raises
    ------
    ValueError
        If any chromosome name from the chromsizes table is not found
        among the loaded FASTA records.
    """
    import bioframe

    sizes = bioframe.read_chromsizes(chromsizes_path, all_names=True)
    sequences = bioframe.load_fasta(fasta_path, engine='pyfaidx', as_raw=True)

    # Every chromosome in the sizes table must have a sequence record.
    every_chrom_found = sizes.index.isin(sequences).all()
    if not every_chrom_found:
        message = "Some chromosomes mentioned in {} are not found in {}".format(
            chromsizes_path, fasta_path
        )
        raise ValueError(message)

    fragments = bioframe.tools.digest(sequences, enzyme_name)
    tsv_text = fragments.to_csv(sep='\t', index=False)
    print(tsv_text)
def test_binnify():
    """Check bin counts from ``bioframe.binnify`` for three bin sizes."""
    sizes_path = testdir + "/test_data/test.chrom.sizes"
    chromsizes = bioframe.read_chromsizes(sizes_path, filter_chroms=False)
    assert len(chromsizes) == 2

    # Bin size equal to the largest chromosome: one bin per chromosome.
    largest = int(np.max(chromsizes.values))
    bins_at_largest = bioframe.binnify(chromsizes, largest)
    assert len(bins_at_largest) == len(chromsizes)

    # Bin size equal to the smallest chromosome: expect one extra bin.
    smallest = int(np.min(chromsizes.values))
    bins_at_smallest = bioframe.binnify(chromsizes, smallest)
    assert len(bins_at_smallest) == (len(chromsizes) + 1)

    # Bin size 1: as many bins as the summed chromosome lengths.
    single_base_bins = bioframe.binnify(chromsizes, 1)
    assert len(single_base_bins) == np.sum(chromsizes.values)
def test_frac_gc():
    """Check ``bioframe.frac_gc`` with ``mapped_only`` on and off."""
    pytest.importorskip("pysam")
    sizes_path = testdir + "/test_data/test.chrom.sizes"
    chromsizes = bioframe.read_chromsizes(sizes_path, filter_chroms=False)
    fasta_records = bioframe.load_fasta(testdir + "/test_data/test.fa")

    def gc_values(binsize, mapped_only):
        # GC fraction per bin for the test genome binned at ``binsize``.
        bins = bioframe.binnify(chromsizes, binsize)
        gc = bioframe.frac_gc(
            bins,
            fasta_records,
            return_input=False,
            mapped_only=mapped_only,
        )
        return gc.values

    # Single-base bins whose mapped fraction is zero must yield NaN
    # when only mapped bases are considered.
    per_base_mapped = bioframe.frac_mapped(
        bioframe.binnify(chromsizes, 1),
        fasta_records,
        return_input=False,
    ).values
    unmapped_bp = (0 == per_base_mapped)
    assert np.isnan(gc_values(1, True)[unmapped_bp]).all()

    # With mapped_only=True, N bases are ignored; a bin made only of N
    # gives np.nan.
    np.testing.assert_equal(
        np.array([0.5, 0.5, np.nan]),
        gc_values(5, True),
    )
    assert (np.array([0.5, 0.5]) == gc_values(7, True)).all()

    # With mapped_only=False, N bases count as zero.
    assert (np.array([0.4, 0.4, 0]) == gc_values(5, False)).all()
    assert (np.array([0.4, 2 / 7]) == gc_values(7, False)).all()
def test_frac_mapped():
    """Check ``bioframe.frac_mapped`` at bin sizes 1, 5 and 7."""
    pytest.importorskip("pysam")
    sizes_path = testdir + "/test_data/test.chrom.sizes"
    chromsizes = bioframe.read_chromsizes(sizes_path, filter_chroms=False)
    fasta_records = bioframe.load_fasta(testdir + "/test_data/test.fa")

    def mapped_values(binsize):
        # Mapped fraction per bin for the test genome binned at ``binsize``.
        bins = bioframe.binnify(chromsizes, binsize)
        mapped = bioframe.frac_mapped(bins, fasta_records, return_input=False)
        return mapped.values

    # Per-base resolution: 1.0 for a mapped base, 0.0 for an unmapped one.
    expected = np.array(
        [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]
    )
    assert (expected == mapped_values(1)).all()

    expected = np.array([0.8, 0.8, 0])
    assert (expected == mapped_values(5)).all()

    expected = np.array([0.8, 4 / 7])
    assert (expected == mapped_values(7)).all()
Exemple #7
0
def binnify(chromsizes_path, binsize):
    """Print a tab-separated table of bins to stdout.

    Parameters
    ----------
    chromsizes_path
        Forwarded to ``bioframe.read_chromsizes`` as the source to read.
    binsize
        Forwarded to ``bioframe.tools.binnify`` as the bin size.
    """
    import bioframe

    sizes = bioframe.read_chromsizes(chromsizes_path)
    table = bioframe.tools.binnify(sizes, binsize)

    # No index column; ``print`` appends its own newline after the CSV text.
    tsv_text = table.to_csv(sep='\t', index=False)
    print(tsv_text)
Exemple #8
0
### Test API:
# Common parameters, defined once at module level.
# NOTE(review): presumably shared by the tests in this module; their
# consumers are not visible in this part of the file.
ignore_diags = 2
clr_weight_name = "weight"
bad_bins = None
chunksize = 10_000  # keep it small to engage chunking
# Names derived from clr_weight_name: "weight1" and "weight2".
weight1 = clr_weight_name + "1"
weight2 = clr_weight_name + "2"
# "balanced" maps to a function that multiplies p["count"] by the two
# weight entries of p. weight1/weight2 are looked up when the lambda is
# called, not when it is defined.
transforms = {"balanced": lambda p: p["count"] * p[weight1] * p[weight2]}
# Bin size that the region boundaries below are aligned to.
assumed_binsize = 1_000_000

# Chromosome sizes file, resolved relative to the directory of this file.
chromsizes_file = op.join(
    op.dirname(op.realpath(__file__)),
    "data/mm9.chrom.sizes.reduced",
)
chromsizes = bioframe.read_chromsizes(chromsizes_file)
chromosomes = list(chromsizes.index)
# One (chrom, 0, size) tuple per chromosome, i.e. from 0 to the value
# stored for that chromosome in chromsizes.
supports = [(chrom, 0, chromsizes[chrom]) for chrom in chromosomes]

# test the most frequent use cases, balancing applied, no bad bins, etc.

# Split each of the first four chromosomes into two adjacent regions:
# (chrom, 0, midpoint) and (chrom, midpoint, size).
# Raises IndexError if fewer than four chromosomes were read.
common_regions = []
for i in range(4):
    chrom = chromosomes[i]
    halfway_chrom = int(chromsizes[chrom] / 2)
    # make halfway_chrom point "bin-aligned" according to anticipated binsize
    # (rounded to the nearest multiple of assumed_binsize)
    halfway_chrom = round(halfway_chrom / assumed_binsize) * assumed_binsize
    reg1 = (chrom, 0, halfway_chrom)
    reg2 = (chrom, halfway_chrom, chromsizes[chrom])
    common_regions.append(reg1)
    common_regions.append(reg2)