Exemple #1
0
def digest(chromsizes_path, fasta_path, enzyme_name):
    """Print the restriction fragments of a genome as tab-separated text.

    Parameters
    ----------
    chromsizes_path
        Path handed to ``bioframe.read_chromsizes`` (with ``all_names=True``).
    fasta_path
        Path handed to ``bioframe.load_fasta`` (pyfaidx engine, raw records).
    enzyme_name
        Enzyme name forwarded to ``bioframe.tools.digest``.

    Raises
    ------
    ValueError
        If any chromosome in the chromsizes index is absent from the FASTA
        records.
    """
    import bioframe

    sizes = bioframe.read_chromsizes(chromsizes_path, all_names=True)
    records = bioframe.load_fasta(fasta_path, engine='pyfaidx', as_raw=True)

    # Every chromosome listed in the sizes file must have a sequence record.
    all_present = sizes.index.isin(records).all()
    if not all_present:
        message = ("Some chromosomes mentioned in {}"
                   " are not found in {}").format(chromsizes_path, fasta_path)
        raise ValueError(message)

    fragments = bioframe.tools.digest(records, enzyme_name)
    tsv_text = fragments.to_csv(sep='\t', index=False)
    print(tsv_text)
def test_digest():
    """Check fragment counts and a cut position from ``bioframe.digest``."""
    pytest.importorskip("Bio")
    records = bioframe.load_fasta(testdir + "/test_data/test.fa")
    assert len(records) == 2

    def fragments_for(enzyme):
        # Digest the test records afresh for each assertion.
        return bioframe.digest(records, enzyme)

    # No HindIII sites in the test.fa records: one row per record is expected.
    hindiii_shape = fragments_for("HindIII").shape
    assert hindiii_shape == (2, 3)

    # A single DpnII site on chrTEST2 adds exactly one row.
    dpnii_shape = fragments_for("DpnII").shape
    assert dpnii_shape == (3, 3)

    # That DpnII site is at chrTEST2 position 3, so the first chrTEST2
    # interval must end there.
    first_chrtest2_fragment = fragments_for("DpnII").iloc[1]
    assert first_chrtest2_fragment.end == 3
Exemple #3
0
def gc(bins_path, fasta_path, mapped_only):
    """Add a ``GC`` column to a bin table and print it as tab-separated text.

    Parameters
    ----------
    bins_path
        Table readable by ``pandas.read_table``; must have a ``chrom`` column.
    fasta_path
        Path handed to ``bioframe.load_fasta`` (pyfaidx engine, raw records).
    mapped_only
        Forwarded as the third positional argument of
        ``bioframe.tools.frac_gc``.

    Raises
    ------
    ValueError
        If a chromosome named in the bin table has no FASTA record.
    """
    import bioframe
    import pandas as pd

    bin_table = pd.read_table(bins_path)
    chrom_names = bin_table['chrom'].unique()
    records = bioframe.load_fasta(fasta_path, engine='pyfaidx', as_raw=True)

    # Refuse to continue as soon as one binned chromosome lacks a sequence.
    record_names = records.keys()
    for chrom in chrom_names:
        if chrom not in record_names:
            raise ValueError("Some chromosomes mentioned in {}"
                             " are not found in {}".format(bins_path,
                                                           fasta_path))

    bin_table['GC'] = bioframe.tools.frac_gc(bin_table, records, mapped_only)
    tsv_text = bin_table.to_csv(sep='\t', index=False)
    print(tsv_text)
Exemple #4
0
def gene_content(genome, binsize, gc=True, fasta_path=None):
    """Compute per-bin gene coverage and, optionally, GC fraction for a genome.

    Parameters
    ----------
    genome
        Assembly name passed to ``bioframe.fetch_chromsizes`` and
        ``frac_gene_coverage``; also used to build the default FASTA path.
    binsize
        Bin size passed to ``binnify``.
    gc
        When true, a ``frac_gc`` column is added to the result.
    fasta_path
        Location of the genome FASTA used for the GC computation. When left
        as ``None`` the original lab-specific location
        ``/net/levsha/share/lab/genomes/<genome>/<genome>.fa`` is used, so
        existing callers behave exactly as before.

    Returns
    -------
    The object returned by ``frac_gene_coverage``, with a ``frac_gc`` entry
    added when ``gc`` is true.
    """
    chrom_sizes = bioframe.fetch_chromsizes(genome)
    chrom_table = binnify(chrom_sizes, binsize)

    gene_count = frac_gene_coverage(chrom_table, genome)
    if gc:
        if fasta_path is None:
            # Historical default: only resolvable on the original lab cluster.
            fasta_path = f'/net/levsha/share/lab/genomes/{genome}/{genome}.fa'
        fasta_records = load_fasta(fasta_path)
        gene_count['frac_gc'] = frac_gc(chrom_table, fasta_records)

    return gene_count
def test_frac_gc():
    """Check ``bioframe.frac_gc`` on the test FASTA for both ``mapped_only`` modes."""
    pytest.importorskip("pysam")
    sizes_path = testdir + "/test_data/test.chrom.sizes"
    chromsizes = bioframe.read_chromsizes(sizes_path, filter_chroms=False)
    fasta_records = bioframe.load_fasta(testdir + "/test_data/test.fa")

    def gc_values(binsize, mapped_only):
        # GC fraction per bin, as a bare array, for freshly built bins.
        bins = bioframe.binnify(chromsizes, binsize)
        result = bioframe.frac_gc(
            bins,
            fasta_records,
            return_input=False,
            mapped_only=mapped_only,
        )
        return result.values

    # Single-base bins with zero mapped fraction must yield NaN GC content.
    mapped_per_base = bioframe.frac_mapped(
        bioframe.binnify(chromsizes, 1),
        fasta_records,
        return_input=False,
    ).values
    unmapped_bp = 0 == mapped_per_base
    assert np.isnan(gc_values(1, True)[unmapped_bp]).all()

    # mapped_only=True should ignore N, or return np.nan if an interval
    # contains only N.
    np.testing.assert_equal(
        np.array([0.5, 0.5, np.nan]),
        gc_values(5, True),
    )

    expected_7bp_mapped = np.array([0.5, 0.5])
    assert (expected_7bp_mapped == gc_values(7, True)).all()

    # mapped_only=False should count N as zero.
    expected_5bp_all = np.array([0.4, 0.4, 0])
    assert (expected_5bp_all == gc_values(5, False)).all()

    expected_7bp_all = np.array([0.4, 2 / 7])
    assert (expected_7bp_all == gc_values(7, False)).all()
Exemple #6
0
def gc(bins_path, fasta_path, mapped_only):
    """Compute GC content for a bin table and print it as tab-separated text.

    Parameters
    ----------
    bins_path
        Table readable by ``pandas.read_table`` with a ``chrom`` column, or
        ``"-"`` to read the table from standard input.
    fasta_path
        Path handed to ``bioframe.load_fasta`` (pyfaidx engine, raw records).
    mapped_only
        Forwarded as the third positional argument of ``bioframe.frac_gc``.

    Raises
    ------
    ValueError
        If a chromosome named in the bin table has no FASTA record.
    """
    # Imported locally, like the other dependencies, so the function does not
    # rely on a module-level ``import sys``.
    import sys

    import bioframe
    import pandas as pd

    # Keep a human-readable name for error messages: once "-" is swapped for
    # the stdin stream, formatting ``bins_path`` would print the stream's repr.
    bins_label = bins_path
    if bins_path == "-":
        bins_label = "<stdin>"
        bins_path = sys.stdin
    bins = pd.read_table(bins_path)
    chromosomes = bins["chrom"].unique()
    fasta_records = bioframe.load_fasta(fasta_path,
                                        engine="pyfaidx",
                                        as_raw=True)
    if any(chrom not in fasta_records.keys() for chrom in chromosomes):
        raise ValueError("Some chromosomes mentioned in {}"
                         " are not found in {}".format(bins_label, fasta_path))
    bins = bioframe.frac_gc(bins, fasta_records, mapped_only)
    print(bins.to_csv(sep="\t", index=False))
def test_frac_mapped():
    """Check ``bioframe.frac_mapped`` on the test FASTA at three bin sizes."""
    pytest.importorskip("pysam")
    sizes_path = testdir + "/test_data/test.chrom.sizes"
    chromsizes = bioframe.read_chromsizes(sizes_path, filter_chroms=False)
    fasta_records = bioframe.load_fasta(testdir + "/test_data/test.fa")

    def mapped_fraction(binsize):
        # Mapped fraction per bin, as a bare array, for freshly built bins.
        bins = bioframe.binnify(chromsizes, binsize)
        return bioframe.frac_mapped(bins, fasta_records,
                                    return_input=False).values

    # (bin size, expected mapped fraction of each resulting bin)
    cases = [
        (1, np.array(
            [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0])),
        (5, np.array([0.8, 0.8, 0])),
        (7, np.array([0.8, 4 / 7])),
    ]
    for binsize, expected in cases:
        assert (expected == mapped_fraction(binsize)).all()
import multiprocess as mp
import numpy as np
import pandas as pd
import bioframe
import cooltools
import cooler
from cooltools.eigdecomp import cooler_cis_eig

# Chromosome sizes for the mm10 assembly. Fetched once and exposed under both
# names the script uses, instead of issuing the same request twice.
chromsizes = bioframe.fetch_chromsizes('mm10')
mm10 = chromsizes
chromosomes = list(chromsizes.index)

# Fixed-size genomic bins annotated with their GC fraction; the 'GC' column
# is passed below as the phasing track for the eigenvectors.
binsize = 10000
bins = cooler.binnify(mm10, binsize)
fasta_records = bioframe.load_fasta('/data05/genomes/mm10_20chr.fa')
bins['GC'] = bioframe.tools.frac_gc(bins, fasta_records)
bins.head()

import fnmatch
import os

# Eigendecomposition output for every 10 kb cooler in the working directory,
# keyed by the part of the file name before the first dot. Previously each
# iteration overwrote the previous result, so only one file's output survived.
eig_results = {}
# Sorted so that processing order (and the final values of ``lam``/``eigs``)
# does not depend on the arbitrary order returned by os.listdir.
for file in sorted(os.listdir('.')):
    if fnmatch.fnmatch(file, '*_10kb.cool'):
        clr = cooler.Cooler(file)
        cond = file.split('.')[0]
        lam, eigs = cooler_cis_eig(clr,
                                   bins,
                                   n_eigs=3,
                                   phasing_track_col='GC',
                                   sort_metric='var_explained')
        eig_results[cond] = (lam, eigs)