def test_binnify():
    """Check how many bins ``bioframe.binnify`` yields for the test chromsizes.

    The fixture is expected to hold exactly two chromosomes; the bin counts
    are then checked for three bin sizes (longest chromosome, shortest
    chromosome, and 1 bp).
    """
    sizes_path = testdir + "/test_data/test.chrom.sizes"
    chromsizes = bioframe.read_chromsizes(sizes_path, filter_chroms=False)
    n_chroms = len(chromsizes)
    assert n_chroms == 2

    lengths = chromsizes.values
    longest = int(np.max(lengths))
    shortest = int(np.min(lengths))

    # Bin size equal to the longest chromosome: expect one bin per chromosome.
    bins_longest = bioframe.binnify(chromsizes, longest)
    assert len(bins_longest) == n_chroms

    # Bin size equal to the shortest chromosome: expect one extra bin overall.
    bins_shortest = bioframe.binnify(chromsizes, shortest)
    assert len(bins_shortest) == (n_chroms + 1)

    # 1-bp bins: expect as many bins as the summed chromosome lengths.
    bins_single_bp = bioframe.binnify(chromsizes, 1)
    assert len(bins_single_bp) == np.sum(lengths)
def binnify(chromsizes_path, binsize, all_names):
    """Print a tab-separated table of fixed-width bins to standard output.

    Parameters
    ----------
    chromsizes_path
        Location of the chromsizes file, handed to
        ``bioframe.read_chromsizes``.
    binsize
        Bin width, handed to ``bioframe.binnify``.
    all_names
        When truthy, chromosome filtering is switched off
        (``filter_chroms=False``); when falsy, filtering is switched on.

    Returns
    -------
    None
        The table is written with ``print``; nothing is returned.
    """
    # Imported lazily, as in the original, so importing this module does not
    # itself require bioframe.
    import bioframe

    apply_filter = not all_names
    sizes = bioframe.read_chromsizes(chromsizes_path, filter_chroms=apply_filter)
    table = bioframe.binnify(sizes, binsize)
    tsv_text = table.to_csv(sep="\t", index=False)
    print(tsv_text)
def gene_content(genome, binsize, gc=True, fasta_path=None):
    """Bin a genome and compute per-bin gene coverage, optionally with GC content.

    Parameters
    ----------
    genome : str
        Assembly name. Passed to ``bioframe.fetch_chromsizes`` and
        ``frac_gene_coverage``, and used to build the default FASTA path.
    binsize
        Bin width, passed to ``bioframe.binnify`` (presumably an int number
        of base pairs -- confirm against callers).
    gc : bool, optional
        When truthy (the default), a ``'frac_gc'`` column is added to the
        result.
    fasta_path : str, optional
        Path to the genome FASTA used for the GC computation. When ``None``
        (the default) the historical lab location
        ``/net/levsha/share/lab/genomes/{genome}/{genome}.fa`` is used, so
        existing callers are unaffected. Ignored when ``gc`` is falsy.

    Returns
    -------
    The table returned by ``frac_gene_coverage``, with a ``'frac_gc'``
    column added when ``gc`` is truthy.
    """
    chrom_sizes = bioframe.fetch_chromsizes(genome)
    # Call bioframe's binnify explicitly: a bare ``binnify`` name can resolve
    # to a module-level wrapper that takes three arguments, prints the table
    # and returns None, which is not usable here.
    chrom_table = bioframe.binnify(chrom_sizes, binsize)
    gene_count = frac_gene_coverage(chrom_table, genome)
    if gc:
        if fasta_path is None:
            # Default kept for backward compatibility with existing callers.
            fasta_path = f'/net/levsha/share/lab/genomes/{genome}/{genome}.fa'
        fasta_records = load_fasta(fasta_path)
        gene_count['frac_gc'] = frac_gc(chrom_table, fasta_records)
    return gene_count
def test_frac_mapped():
    """Compare ``bioframe.frac_mapped`` with hand-written per-bin fractions.

    Skipped when ``pysam`` is not importable. Bin sizes 1, 5 and 7 are
    checked against the test chromsizes and FASTA fixtures.
    """
    pytest.importorskip("pysam")
    chromsizes = bioframe.read_chromsizes(
        testdir + "/test_data/test.chrom.sizes", filter_chroms=False
    )
    fasta_records = bioframe.load_fasta(testdir + "/test_data/test.fa")

    def mapped_fractions(binsize):
        # Bin the fixture at ``binsize`` and return the bare fraction values.
        bins = bioframe.binnify(chromsizes, binsize)
        result = bioframe.frac_mapped(bins, fasta_records, return_input=False)
        return result.values

    # 1-bp bins: each base is either fully mapped (1.0) or not (0.0).
    expected = np.array(
        [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]
    )
    assert (expected == mapped_fractions(1)).all()

    # 5-bp bins.
    expected = np.array([0.8, 0.8, 0])
    assert (expected == mapped_fractions(5)).all()

    # 7-bp bins.
    expected = np.array([0.8, 4 / 7])
    assert (expected == mapped_fractions(7)).all()
def test_frac_gc():
    """Check ``bioframe.frac_gc`` with and without ``mapped_only``.

    Skipped when ``pysam`` is not importable. Uses the test chromsizes and
    FASTA fixtures at bin sizes 1, 5 and 7.
    """
    pytest.importorskip("pysam")
    chromsizes = bioframe.read_chromsizes(
        testdir + "/test_data/test.chrom.sizes", filter_chroms=False
    )
    fasta_records = bioframe.load_fasta(testdir + "/test_data/test.fa")

    def gc_fractions(binsize, mapped_only):
        # Bin the fixture at ``binsize`` and return the bare GC fraction values.
        bins = bioframe.binnify(chromsizes, binsize)
        result = bioframe.frac_gc(
            bins,
            fasta_records,
            return_input=False,
            mapped_only=mapped_only,
        )
        return result.values

    # Boolean mask of 1-bp bins whose mapped fraction is exactly zero.
    single_bp_mapped = bioframe.frac_mapped(
        bioframe.binnify(chromsizes, 1), fasta_records, return_input=False
    ).values
    unmapped_bp = 0 == single_bp_mapped
    assert np.isnan(gc_fractions(1, True)[unmapped_bp]).all()

    ## mapped_only=True should ignore N or return np.nan if interval only contains N
    np.testing.assert_equal(
        np.array([0.5, 0.5, np.nan]),
        gc_fractions(5, True),
    )
    assert (np.array([0.5, 0.5]) == gc_fractions(7, True)).all()

    ## mapped_only=False should count N as zero
    assert (np.array([0.4, 0.4, 0]) == gc_fractions(5, False)).all()
    assert (np.array([0.4, 2 / 7]) == gc_fractions(7, False)).all()