def test_annotation_init(caplog, annot):
    """Check which files and attributes ``genomepy.Annotation`` picks up on init.

    Uses the ``annot`` fixture (defined outside this block) plus three
    ad-hoc annotations: one with both a plain and a gzipped GTF, one with
    no annotation files, and one whose genome does not exist.
    """
    assert annot.genome_file.endswith("data/regexp/regexp.fa")
    assert annot.annotation_gtf_file.endswith("data/regexp/regexp.annotation.gtf")
    assert annot.annotation_bed_file.endswith("data/regexp/regexp.annotation.bed")
    assert annot.annotation_contigs == ["chrM"]  # from BED
    assert len(annot.genome_contigs) == 17  # from sizes
    assert isinstance(annot.bed, pd.DataFrame)
    assert isinstance(annot.gtf, pd.DataFrame)
    assert annot.genes("bed") == ["NP_059343.1"]

    # >1 GTF files: take unzipped
    g1 = "tests/data/data.annotation.gtf"
    g2 = "tests/data/data.annotation.gtf.gz"
    try:
        with genomepy.files._open(g1, "w") as fa:
            fa.write("1")
        with genomepy.files._open(g2, "w") as fa:
            fa.write("2")
        a = genomepy.Annotation("data", genomes_dir="tests")
        assert a.annotation_gtf_file.endswith(g1)
    finally:
        # clean up even when the assertion above fails, so the temporary
        # files cannot leak into tests/data and affect later runs
        genomepy.utils.rm_rf(g1)
        genomepy.utils.rm_rf(g2)

    # not enough GTF files: both missing files are reported in the log
    genomepy.Annotation("empty", genomes_dir="tests/data")
    assert "Could not find 'empty.annotation.bed" in caplog.text
    assert "Could not find 'empty.annotation.gtf" in caplog.text

    # Genome doesn't exist
    with pytest.raises(FileNotFoundError):
        genomepy.Annotation("never_existed", genomes_dir="tests/data")
def test_named_gtf():
    """``named_gtf`` is indexed by gene name; the plain ``gtf`` keeps an int index."""
    annotation = genomepy.Annotation("sacCer3", genomes_dir="tests/data")
    named = annotation.named_gtf

    assert named.index.name == "gene_name"
    assert str(annotation.gtf.index.dtype) == "int64"
    assert str(named.index.dtype) == "object"

    # every row for this gene sits on the same contig
    contigs = set(named.at["YDL248W", "seqname"])
    assert contigs == {"chrIV"}
def test_match_contigs():
    """Exercise ``sanitize._match_contigs`` with and without a fixable contig."""
    match_contigs = genomepy.annotation.sanitize._match_contigs

    # nothing to work with
    annotation = genomepy.Annotation("sacCer3", genomes_dir="tests/data")
    assert match_contigs(annotation) is None

    # one missing contig, one fixable contig
    annotation = genomepy.Annotation("sanitize", genomes_dir="tests/data")
    gtf_before = annotation.gtf
    conversion = match_contigs(annotation)
    gtf_after = annotation.gtf

    assert conversion == {"NC_007112.7": "1"}
    assert annotation.genome_contigs == ["1"]
    seqnames_before = list(gtf_before.seqname.unique())
    seqnames_after = list(gtf_after.seqname.unique())
    assert seqnames_before == ["NC_007112.7", "NC_002333.2"]
    assert seqnames_after == ["1", "NC_002333.2"]
def test_custom_annotation():
    """``Annotation`` accepts a direct path to a (gzipped) BED or GTF file.

    Any other file type is rejected with ``NotImplementedError``.
    """
    for fname in [
        "tests/data/custom.annotation.bed",
        "tests/data/custom.annotation.bed.gz",
    ]:
        a = genomepy.Annotation(name=fname, genomes_dir="tests/data")
        # the message identifies which input file broke the loop
        assert a.bed.shape[0] == 10, fname

    for fname in [
        "tests/data/custom.annotation.gtf",
        "tests/data/custom.annotation.gtf.gz",
    ]:
        a = genomepy.Annotation(name=fname, genomes_dir="tests/data")
        assert a.gtf.shape[0] == 45, fname

    # unsupported file type: the constructor itself must raise
    with pytest.raises(NotImplementedError):
        genomepy.Annotation(name="tests/data/regions.txt", genomes_dir="tests/data")
def test_filter_contigs():
    """``sanitize._filter_contigs`` reports and drops contigs absent from the genome."""
    extra_contig = "chrV"
    annotation = genomepy.Annotation("sacCer3", genomes_dir="tests/data")
    assert extra_contig not in annotation.bed.chrom.unique()

    # add a chromosome to the BED not present in the genome
    annotation.bed.at[0, "chrom"] = extra_contig
    assert extra_contig in annotation.bed.chrom.unique()

    dropped = genomepy.annotation.sanitize._filter_contigs(annotation)
    assert dropped == {extra_contig}
    assert extra_contig not in annotation.bed.chrom.unique()
def test__parse_annot():
    """``utils._parse_annot`` accepts "bed", "gtf" or a dataframe, and nothing else."""
    parse_annot = genomepy.annotation.utils._parse_annot
    annotation = genomepy.Annotation("sacCer3", genomes_dir="tests/data")

    # string keywords resolve to the matching attribute
    from_bed_keyword = parse_annot(annotation, "bed")
    assert from_bed_keyword.equals(annotation.bed)
    from_gtf_keyword = parse_annot(annotation, "gtf")
    assert from_gtf_keyword.equals(annotation.gtf)

    # a dataframe comes back with equal content
    from_dataframe = parse_annot(annotation, annotation.bed)
    assert from_dataframe.equals(annotation.bed)

    # anything else is rejected
    with pytest.raises(ValueError):
        parse_annot(annotation, 1)
def test_map_locations():
    """``Annotation.map_locations`` renames contigs to another provider's names.

    Checked for a BED slice, a GTF slice, and a custom dataframe that was
    already indexed on another column.
    """
    a = genomepy.Annotation("sacCer3", genomes_dir="tests/data")

    # BED + Ensembl
    ens = a.map_locations(annot=a.bed.head(1), to="Ensembl")
    assert ens.chrom.to_list() == ["IV"]

    # GTF + NCBI
    ncbi = a.map_locations(annot=a.gtf.head(1), to="NCBI")
    assert ncbi.seqname.to_list() == ["IV"]

    # custom dataframe (already indexed)
    df = a.bed.head(1).set_index("start")
    cus = a.map_locations(annot=df, to="Ensembl")
    # check the custom result itself (this used to re-assert `ens`, which
    # left the mapped contig of the indexed dataframe unverified)
    assert cus.chrom.to_list() == ["IV"]
    # the caller's index must survive the mapping
    assert cus.index.name == "start"
def test_gene_coords(caplog):
    """``Annotation.gene_coords`` returns five coordinate columns per gene.

    Also checks the log messages emitted when none, or only some, of the
    requested genes are found.
    """
    annotation = genomepy.Annotation("sacCer3", genomes_dir="tests/data")

    first_bed_genes = annotation.genes("bed")[:10]
    coords = annotation.gene_coords(first_bed_genes, "bed")
    assert coords.shape == (10, 5)
    assert coords.columns.to_list() == ["chrom", "start", "end", "name", "strand"]

    first_gtf_genes = annotation.genes("gtf")[:10]
    coords = annotation.gene_coords(first_gtf_genes, "gtf")
    assert coords.shape == (10, 5)
    expected_gtf_columns = ["seqname", "start", "end", "gene_name", "strand"]
    assert coords.columns.to_list() == expected_gtf_columns

    # mismatched gene names!
    annotation.gene_coords(["what?"], "bed")
    assert "No genes found." in caplog.text
    annotation.gene_coords(["YDL200C_mRNA", "what?"], "bed")
    assert "Only 50% of genes was found." in caplog.text
def test_map_genes():
    """``Annotation.map_genes`` converts the BED ``name`` column to another ID type."""
    annotation = genomepy.Annotation("GRCz11", genomes_dir="tests/data")
    bed_head = annotation.bed.head()

    # the input starts out with Ensembl transcript IDs
    transcripts = bed_head.name.to_list()
    assert transcripts[0] == "ENSDART00000159919"

    # transcript to gene
    mapped_genes = annotation.map_genes(field="ensembl.gene", annot=bed_head)
    gene_ids = mapped_genes.name.to_list()
    assert gene_ids[0] == "ENSDARG00000103202"

    # NOTE: a transcript-to-symbol check (field="symbol", expecting the first
    # name to be "CR383668.1") was commented out here and is still disabled.

    # refseq hits & subtypes
    mapped_proteins = annotation.map_genes(
        field="refseq", product="protein", annot=bed_head
    )
    first_protein = mapped_proteins.name.to_list()[0]
    assert first_protein.startswith("NP_")
def test_sanitize():
    """After ``sanitize`` the GTF has 4 rows, 9 columns and only contig "1"."""
    annotation = genomepy.Annotation("sanitize", genomes_dir="tests/data")
    annotation.sanitize(overwrite=False)

    sanitized = annotation.gtf
    assert sanitized.shape == (4, 9)
    remaining_contigs = set(sanitized.seqname)
    assert remaining_contigs == {"1"}
def test_genes():
    """``Annotation.genes`` returns a list for both annotation types."""
    annotation = genomepy.Annotation("GRCz11", genomes_dir="tests/data")
    for annotation_type in ("bed", "gtf"):
        gene_names = annotation.genes(annotation_type)
        assert isinstance(gene_names, list)
ax.set_xlabel("contig number (ordered by size)") ax.set_ylabel("contig size") ax.set_ylim((1, 15 * max(sizes))) plt.xscale("log") plt.yscale("log") # now save the image as html text img = io.BytesIO() fig.savefig(img, format='png') img.seek(0) html = '<img src="data:image/png;base64, {}">'.format( base64.b64encode(img.getvalue()).decode('utf-8')) # if we have an annotation check for the number of genes present if hasattr(snakemake.input, "annotation"): gp = genomepy.Annotation(assembly, genomes_dir=snakemake.params.genomes_dir) nr_genes = len(gp.genes("gtf")) annotation_text = f"""The genome annotation contains {nr_genes} genes.""" else: annotation_text = "" # save it all with open(outfile, "a") as f: f.write(f""" <!-- id: 'assembly_stats' section_name: 'Assembly stats' --> Genome assembly {assembly} contains of {len(sizes)} contigs, with a GC-content \ of {gc / (gc + at) * 100:.2f}%, and {(total_size - gc - at) / total_size * 100:.2f}%\ consists of the letter N. The <a href="https://en.wikipedia.org/wiki/N50,_L50,_and_r\