Exemple #1
0
def test_annotation_init(caplog, annot):
    """Check which files and contigs ``genomepy.Annotation`` picks up on init.

    Covers four situations:

    * the ``annot`` fixture: genome/GTF/BED paths, contigs, dataframes, genes;
    * both a plain and a gzipped GTF present: the plain one is selected;
    * no annotation files present: a message is logged for BED and GTF;
    * the genome itself is absent: ``FileNotFoundError`` is raised.
    """
    assert annot.genome_file.endswith("data/regexp/regexp.fa")
    assert annot.annotation_gtf_file.endswith("data/regexp/regexp.annotation.gtf")
    assert annot.annotation_bed_file.endswith("data/regexp/regexp.annotation.bed")

    assert annot.annotation_contigs == ["chrM"]  # from BED
    assert len(annot.genome_contigs) == 17  # from sizes
    assert isinstance(annot.bed, pd.DataFrame)
    assert isinstance(annot.gtf, pd.DataFrame)
    assert annot.genes("bed") == ["NP_059343.1"]

    # >1 GTF files: take unzipped
    g1 = "tests/data/data.annotation.gtf"
    with genomepy.files._open(g1, "w") as gtf:
        gtf.write("1")
    g2 = "tests/data/data.annotation.gtf.gz"
    with genomepy.files._open(g2, "w") as gtf:
        gtf.write("2")
    try:
        a = genomepy.Annotation("data", genomes_dir="tests")
        assert a.annotation_gtf_file.endswith(g1)
    finally:
        # remove the scratch files even if the check above fails, so that a
        # failing run does not leave stray annotations behind for other tests
        genomepy.utils.rm_rf(g1)
        genomepy.utils.rm_rf(g2)

    # not enough GTF files
    genomepy.Annotation("empty", genomes_dir="tests/data")
    assert "Could not find 'empty.annotation.bed" in caplog.text
    assert "Could not find 'empty.annotation.gtf" in caplog.text

    # Genome doesn't exist
    with pytest.raises(FileNotFoundError):
        genomepy.Annotation("never_existed", genomes_dir="tests/data")
Exemple #2
0
def test_named_gtf():
    """``named_gtf`` is indexed by gene name; the plain ``gtf`` is not."""
    annotation = genomepy.Annotation("sacCer3", genomes_dir="tests/data")
    named = annotation.named_gtf

    assert named.index.name == "gene_name"

    # the regular GTF keeps its integer index, the named one holds strings
    plain_dtype = str(annotation.gtf.index.dtype)
    named_dtype = str(named.index.dtype)
    assert plain_dtype == "int64"
    assert named_dtype == "object"

    # every row of this gene sits on the same contig
    contigs = set(named.at["YDL248W", "seqname"])
    assert contigs == {"chrIV"}
Exemple #3
0
def test_match_contigs():
    """``_match_contigs`` returns a contig conversion dict and updates the GTF."""
    match_contigs = genomepy.annotation.sanitize._match_contigs

    # nothing to work with
    annotation = genomepy.Annotation("sacCer3", genomes_dir="tests/data")
    assert match_contigs(annotation) is None

    # one missing contig, one fixable contig
    annotation = genomepy.Annotation("sanitize", genomes_dir="tests/data")
    gtf_before = annotation.gtf
    conversion = match_contigs(annotation)
    gtf_after = annotation.gtf

    assert conversion == {"NC_007112.7": "1"}
    assert annotation.genome_contigs == ["1"]

    contigs_before = list(gtf_before.seqname.unique())
    contigs_after = list(gtf_after.seqname.unique())
    assert contigs_before == ["NC_007112.7", "NC_002333.2"]
    assert contigs_after == ["1", "NC_002333.2"]
Exemple #4
0
def test_custom_annotation():
    """An ``Annotation`` can be built directly from a BED or GTF file path.

    Plain and gzipped files are both accepted; any other file type raises
    ``NotImplementedError``.
    """
    genomes_dir = "tests/data"

    bed_files = (
        "tests/data/custom.annotation.bed",
        "tests/data/custom.annotation.bed.gz",
    )
    for bed_file in bed_files:
        annotation = genomepy.Annotation(name=bed_file, genomes_dir=genomes_dir)
        assert len(annotation.bed) == 10

    gtf_files = (
        "tests/data/custom.annotation.gtf",
        "tests/data/custom.annotation.gtf.gz",
    )
    for gtf_file in gtf_files:
        annotation = genomepy.Annotation(name=gtf_file, genomes_dir=genomes_dir)
        assert len(annotation.gtf) == 45

    # neither a BED nor a GTF file
    with pytest.raises(NotImplementedError):
        genomepy.Annotation(name="tests/data/regions.txt", genomes_dir=genomes_dir)
Exemple #5
0
def test_filter_contigs():
    """``_filter_contigs`` reports and drops BED contigs absent from the genome."""
    annotation = genomepy.Annotation("sacCer3", genomes_dir="tests/data")

    def bed_contigs():
        # re-read on every call: the BED dataframe changes during the test
        return annotation.bed.chrom.unique()

    assert "chrV" not in bed_contigs()

    # add a chromosome to the BED not present in the genome
    annotation.bed.at[0, "chrom"] = "chrV"
    assert "chrV" in bed_contigs()

    removed = genomepy.annotation.sanitize._filter_contigs(annotation)
    assert removed == {"chrV"}
    assert "chrV" not in bed_contigs()
Exemple #6
0
def test__parse_annot():
    """``_parse_annot`` accepts "bed", "gtf" or a dataframe; else ``ValueError``."""
    annotation = genomepy.Annotation("sacCer3", genomes_dir="tests/data")
    parse = genomepy.annotation.utils._parse_annot

    # annotation type given by name
    assert parse(annotation, "bed").equals(annotation.bed)
    assert parse(annotation, "gtf").equals(annotation.gtf)

    # a dataframe comes back equal to what was passed in
    assert parse(annotation, annotation.bed).equals(annotation.bed)

    # anything else is rejected
    with pytest.raises(ValueError):
        parse(annotation, 1)
Exemple #7
0
def test_map_locations():
    """``map_locations`` converts contig names to another provider's naming.

    Checks the BED and GTF dataframes, and a custom dataframe that already
    carries its own index, which must come back with that index intact.
    """
    a = genomepy.Annotation("sacCer3", genomes_dir="tests/data")

    # BED + Ensembl
    ens = a.map_locations(annot=a.bed.head(1), to="Ensembl")
    assert ens.chrom.to_list() == ["IV"]

    # GTF + NCBI
    ncbi = a.map_locations(annot=a.gtf.head(1), to="NCBI")
    assert ncbi.seqname.to_list() == ["IV"]

    # custom dataframe (already indexed)
    df = a.bed.head(1).set_index("start")
    cus = a.map_locations(annot=df, to="Ensembl")
    # check the result for the custom frame itself (previously this line
    # re-checked ``ens``, leaving the custom mapping unverified)
    assert cus.chrom.to_list() == ["IV"]
    assert cus.index.name == "start"
Exemple #8
0
def test_gene_coords(caplog):
    """``gene_coords`` returns five coordinate columns per requested gene.

    Also checks the messages logged when none, or only part, of the
    requested genes are found.
    """
    annotation = genomepy.Annotation("sacCer3", genomes_dir="tests/data")

    expected_columns = {
        "bed": ["chrom", "start", "end", "name", "strand"],
        "gtf": ["seqname", "start", "end", "gene_name", "strand"],
    }
    for annot_type, columns in expected_columns.items():
        first_ten = annotation.genes(annot_type)[0:10]
        coords = annotation.gene_coords(first_ten, annot_type)
        assert list(coords.shape) == [10, 5]
        assert coords.columns.to_list() == columns

    # mismatched gene names!
    annotation.gene_coords(["what?"], "bed")
    assert "No genes found." in caplog.text

    annotation.gene_coords(["YDL200C_mRNA", "what?"], "bed")
    assert "Only 50% of genes was found." in caplog.text
Exemple #9
0
def test_map_genes():
    """``map_genes`` replaces the BED ``name`` column with another identifier."""
    annotation = genomepy.Annotation("GRCz11", genomes_dir="tests/data")

    head = annotation.bed.head()
    first_transcript = head.name.to_list()[0]
    assert first_transcript == "ENSDART00000159919"

    # transcript to gene
    as_genes = annotation.map_genes(field="ensembl.gene", annot=head)
    first_gene = as_genes.name.to_list()[0]
    assert first_gene == "ENSDARG00000103202"

    # # transcript to symbol (check currently disabled)
    # res = a.map_genes(field="symbol", df=bed)
    # symbol = res.name.to_list()
    # assert symbol[0] == "CR383668.1"

    # refseq hits & subtypes
    as_protein = annotation.map_genes(field="refseq", product="protein", annot=head)
    first_protein = as_protein.name.to_list()[0]
    assert first_protein.startswith("NP_")
Exemple #10
0
def test_sanitize():
    """After ``sanitize`` the GTF has 4 rows of 9 columns, all on contig "1"."""
    annotation = genomepy.Annotation("sanitize", genomes_dir="tests/data")
    annotation.sanitize(overwrite=False)

    n_rows, n_columns = annotation.gtf.shape
    assert (n_rows, n_columns) == (4, 9)

    remaining_contigs = set(annotation.gtf.seqname)
    assert remaining_contigs == {"1"}
Exemple #11
0
def test_genes():
    """``genes`` returns a list for both the BED and the GTF annotation."""
    annotation = genomepy.Annotation("GRCz11", genomes_dir="tests/data")
    for annot_type in ("bed", "gtf"):
        gene_names = annotation.genes(annot_type)
        assert isinstance(gene_names, list)
Exemple #12
0
# Label the contig-size plot and put both axes on a log scale.
# NOTE(review): `ax`, `fig` and `sizes` are created before this excerpt;
# `sizes` is presumably the list of contig lengths - confirm upstream.
ax.set_xlabel("contig number (ordered by size)")
ax.set_ylabel("contig size")
# y-axis runs from 1 up to 15x the largest value in `sizes`
ax.set_ylim((1, 15 * max(sizes)))
plt.xscale("log")
plt.yscale("log")

# now save the image as html text:
# render the figure to an in-memory PNG and embed it, base64-encoded,
# in an <img> tag as a data URI, so no separate image file is needed
img = io.BytesIO()
fig.savefig(img, format='png')
img.seek(0)
html = '<img src="data:image/png;base64, {}">'.format(
    base64.b64encode(img.getvalue()).decode('utf-8'))

# if we have an annotation check for the number of genes present
# (detected by an `annotation` attribute on the snakemake input)
if hasattr(snakemake.input, "annotation"):
    gp = genomepy.Annotation(assembly,
                             genomes_dir=snakemake.params.genomes_dir)
    # number of entries returned for the GTF annotation
    nr_genes = len(gp.genes("gtf"))
    annotation_text = f"""The genome annotation contains {nr_genes} genes."""
else:
    # no annotation supplied: use an empty sentence instead
    annotation_text = ""

# save it all
with open(outfile, "a") as f:
    f.write(f"""
<!--
id: 'assembly_stats'
section_name: 'Assembly stats'
-->
Genome assembly {assembly} contains of {len(sizes)} contigs, with a GC-content \
of {gc / (gc + at) * 100:.2f}%, and {(total_size - gc - at) / total_size * 100:.2f}%\
 consists of the letter N. The <a href="https://en.wikipedia.org/wiki/N50,_L50,_and_r\