def main():
    """Run the data-preparation steps selected by the parsed arguments.

    Step (1), unless ``args.skip_eventalign_indexing`` is set, calls
    ``parallel_index`` on the eventalign file.  Step (2) calls
    ``parallel_preprocess_gene`` when ``args.genome`` is set (using either a
    customised ``Genome`` or an ``EnsemblRelease``), otherwise
    ``parallel_preprocess_tx``.

    When a customised genome is requested but any of ``reference_name``,
    ``annotation_name``, ``gtf_path_or_url`` or
    ``transcript_fasta_paths_or_urls`` is missing, a hint is printed.  If a
    gene-level run was requested as well, the function returns right after
    the hint, because the customised database cannot be built.
    """
    args = get_args()

    n_processes = args.n_processes
    eventalign_filepath = args.eventalign
    summary_filepath = args.summary
    chunk_size = args.chunk_size
    out_dir = args.out_dir
    ensembl_version = args.ensembl
    ensembl_species = args.species
    readcount_min = args.readcount_min
    readcount_max = args.readcount_max
    resume = args.resume
    genome = args.genome
    customised_genome = args.customised_genome

    # Bind these unconditionally so that no later branch can hit an unbound
    # local name.
    reference_name = args.reference_name
    annotation_name = args.annotation_name
    gtf_path_or_url = args.gtf_path_or_url
    transcript_fasta_paths_or_urls = args.transcript_fasta_paths_or_urls

    customised_fields = [
        reference_name,
        annotation_name,
        gtf_path_or_url,
        transcript_fasta_paths_or_urls,
    ]
    if customised_genome and any(field is None for field in customised_fields):
        print(
            'If you have your own customised genome not in Ensembl, please provide the following'
        )
        print('- reference_name')
        print('- annotation_name')
        print('- gtf_path_or_url')
        print('- transcript_fasta_paths_or_urls')
        if genome:
            # The customised database cannot be constructed; stop before the
            # expensive indexing step instead of failing after it.
            return

    misc.makedirs(out_dir)  # todo: check every level.

    # (1) For each read, combine multiple events aligned to the same
    # positions, the results from nanopolish eventalign, into a single event
    # per position.
    if not args.skip_eventalign_indexing:
        parallel_index(eventalign_filepath, summary_filepath, chunk_size,
                       out_dir, n_processes, resume)

    # (2) Create a .json file, where the info of all reads are stored per
    # position, for modelling.
    if genome:
        if customised_genome:
            db = Genome(
                reference_name=reference_name,
                annotation_name=annotation_name,
                gtf_path_or_url=gtf_path_or_url,
                transcript_fasta_paths_or_urls=transcript_fasta_paths_or_urls)
            # parse GTF and construct database of genomic features
            db.index()
        else:
            # Default: human reference genome GRCh38 release 91 used in the
            # ont mapping.
            db = EnsemblRelease(ensembl_version, ensembl_species)
        parallel_preprocess_gene(eventalign_filepath, db, out_dir,
                                 n_processes, readcount_min, readcount_max,
                                 resume)
    else:
        parallel_preprocess_tx(eventalign_filepath, out_dir, n_processes,
                               readcount_min, readcount_max, resume)
def setup_create_genome():
    """Create the module-level ``mouse_genome`` fixture.

    The genome is built from the ENSMUSG00000017167 GTF, transcript FASTA and
    protein FASTA test files; its cache is cleared and it is then indexed.
    """
    global mouse_genome
    genome_options = dict(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
        protein_fasta_path_or_url=MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH,
    )
    mouse_genome = Genome(**genome_options)
    mouse_genome.clear_cache()
    mouse_genome.index()
def main():
    """Print every region of a BED file followed by its gene symbol.

    The BED path is taken from ``sys.argv[1]``.  A ``Genome`` is built from
    the hard-coded GTF file and indexed; for each non-blank line, the first
    three whitespace-separated fields are passed to ``Region``, the symbol is
    resolved with ``Region.get_symbol`` and the line is printed together with
    ``Region.gene``.
    """
    bed = sys.argv[1]
    gtf = "Homo_sapiens.GRCh38.90.gtf"
    hg38 = Genome(reference_name='GRCh38',
                  annotation_name='my_genome_features',
                  gtf_path_or_url=gtf)
    hg38.index()
    with open(bed) as f:
        for line in f:
            record = line.strip()
            if not record:
                # A blank line has no fields; unpacking it below would raise
                # ValueError and abort the whole run.
                continue
            # Columns after the third are not used.
            chromosome, start, end, *_ = record.split()
            r = Region(chromosome, start, end)
            r.get_symbol(hg38)
            print(record, r.gene)
def test_ucsc_gencode_genome():
    """
    Check a Genome built from a small GENCODE GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables
    """
    with TemporaryDirectory() as cache_dir:
        genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_GENCODE_PATH,
            cache_directory_path=cache_dir,
        )
        genome.index()

        # Every gene must carry an ID, and there must be exactly seven.
        all_genes = genome.genes()
        for current_gene in all_genes:
            assert current_gene.id, "Gene with missing ID in %s" % (genome.gtf.dataframe(),)
        gene_count = len(all_genes)
        assert gene_count == 7, "Expected 7 genes, got %d: %s" % (gene_count, all_genes)

        # Same two checks for transcripts.
        all_transcripts = genome.transcripts()
        for current_transcript in all_transcripts:
            assert current_transcript.id, "Transcript with missing ID in %s" % (genome.gtf.dataframe(),)
        transcript_count = len(all_transcripts)
        assert transcript_count == 7, "Expected 7 transcripts, got %d: %s" % (transcript_count, all_transcripts)

        # Lookup by ID: this gene has neither a name nor a biotype.
        looked_up = genome.gene_by_id("uc001aak.4")
        eq_(looked_up.id, "uc001aak.4")
        eq_(looked_up.name, None)
        eq_(looked_up.biotype, None)

        # Lookups by locus.
        genes_here = genome.genes_at_locus(1, 17369)
        eq_(genes_here[0].id, "uc031tla.1")

        transcripts_here = genome.transcripts_at_locus(1, 30564)
        eq_(transcripts_here[0].id, "uc057aty.1")
def test_mouse_ENSMUSG00000017167():
    """
    Check gene, transcript and sequence lookups for mouse gene Cntnap1.

    GTF cropped from
    ftp://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/Mus_musculus.GRCm38.81.gtf.gz
    via:
        grep "ENSMUSG00000017167" Mus_musculus.GRCm38.81.gtf

    Transcript FASTA cropped from
    ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz
    via:
        grep "ENSMUSG00000017167" Mus_musculus.GRCm38.cdna.all.fa -A 50

    Protein FASTA cropped from
    ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz
    via:
        grep "ENSMUSG00000017167" Mus_musculus.GRCm38.pep.all.fa -A 50

    Tested against:
    http://useast.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000017167
    """
    expected_cdna_prefix = (
        "GAGAGAAGGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA"
        "GAGAGAGAGAGATTGGGGGTAGGAGAGAGGGAAGGGTGGATAAGGACGGAAAAAAGCTTT"
    )
    expected_protein_prefix = (
        "MMSLRLFSILLATVVSGAWGWGYYGCNEELVGPLYARSLGASSYYGLFTTARFARLHGIS"
        "GWSPRIGDPNPWLQIDLMKKHRIRAVATQGAFNSWDWVTRYMLLYGDRVDSWTPFYQKGH"
    )

    genome = Genome(
        reference_name="GRCm38",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
        protein_fasta_path_or_url=MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH,
    )
    genome.install()

    # Exactly one gene is named Cntnap1 ...
    matching_genes = genome.genes_by_name("Cntnap1")
    eq_(len(matching_genes), 1)

    # ... and it has two transcripts, one of them protein coding.
    all_transcripts = matching_genes[0].transcripts
    eq_(len(all_transcripts), 2)

    coding_transcripts = [
        candidate
        for candidate in all_transcripts
        if candidate.biotype == "protein_coding"
    ]
    eq_(len(coding_transcripts), 1)

    # Compare the first 120 characters of the cDNA and protein sequences.
    coding = coding_transcripts[0]
    eq_(coding.sequence[:120], expected_cdna_prefix)
    eq_(coding.protein_sequence[:120], expected_protein_prefix)
def test_gtf_transcript_only():
    """A Genome with a GTF and transcript FASTA but no protein FASTA.

    Transcript sequences are available; reading a protein sequence raises a
    ValueError that is checked by ``no_protein_``.
    """
    sources = {
        "reference_name": "GRCm38",
        "annotation_name": "_test_mouse_ensembl81_subset",
        "gtf_path_or_url": MOUSE_ENSMUSG00000017167_PATH,
        "transcript_fasta_path_or_url": MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    }
    genome = Genome(**sources)
    genome.index()

    eq_(1, len(genome.genes()))

    first_transcript = genome.transcripts()[0]
    ok_(first_transcript.sequence)

    with assert_raises(ValueError) as raised:
        first_transcript.protein_sequence
    no_protein_(raised)
def test_gtf_only():
    """A Genome built from a GTF alone has genes but no sequences.

    Both sequence lookups must raise ValueError; the raised error is handed
    to the matching checker (``no_transcript_`` / ``no_protein_``).
    """
    genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
    )
    genome.index()

    eq_(1, len(genome.genes()))

    expectations = (
        (lambda: genome.transcript_sequence("test"), no_transcript_),
        (lambda: genome.protein_sequence("test"), no_protein_),
    )
    for failing_lookup, check_error in expectations:
        with assert_raises(ValueError) as cm:
            failing_lookup()
        check_error(cm)
def test_ucsc_refseq_genome():
    """
    Check a Genome object built from a small RefSeq GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables
    """
    with TemporaryDirectory() as cache_dir:
        genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_REFSEQ_PATH,
            cache_directory_path=cache_dir,
        )
        genome.index()

        # All genes have an ID and there are exactly two of them.
        all_genes = genome.genes()
        for current_gene in all_genes:
            assert current_gene.id, "Gene with missing ID in %s" % (genome.gtf.dataframe(),)
        assert len(all_genes) == 2, "Expected 2 genes, got %d: %s" % (len(all_genes), all_genes)

        # Same for transcripts.
        all_transcripts = genome.transcripts()
        for current_transcript in all_transcripts:
            assert current_transcript.id, "Transcript with missing ID in %s" % (genome.gtf.dataframe(),)
        assert len(all_transcripts) == 2, "Expected 2 transcripts, got %d: %s" % (len(all_transcripts), all_transcripts)

        # Both genes overlap this locus.
        overlapping = genome.genes_at_locus(1, 67092176)
        assert len(overlapping) == 2, "Expected 2 genes at locus chr1:67092176, got %d: %s" % (len(overlapping), overlapping)
        found_ids = {overlapping_gene.id for overlapping_gene in overlapping}
        eq_({"NM_001276352", "NR_075077"}, found_ids)
def get_gene_ids_of_transcript_id(transcript_id: str,
                                  ensembl: pyensembl.Genome,
                                  raise_on_error: bool = False):
    """ Extract all gene ids associated with the given transcript.

    The gene name of the transcript is looked up first, then every gene id
    registered under that gene name is collected.

    Parameters
    ----------
    transcript_id: string
        The transcript identifier

    ensembl: pyensembl.Genome
        The annotations

    raise_on_error: bool
        Whether to raise a ValueError (True) or log a warning and return
        None (False) when either lookup raises a ValueError

    Returns
    -------
    transcript_gene_ids: list of dicts
        One dictionary per gene id, each with the keys:

            transcript_id
            gene_id

        --- OR ---

        None, if a lookup failed and `raise_on_error` is False

    Raises
    ------
    ValueError
        If a lookup failed and `raise_on_error` is True. The original
        exception is chained as the cause.
    """
    try:
        gene_name = ensembl.gene_name_of_transcript_id(transcript_id)
        gene_ids = ensembl.gene_ids_of_gene_name(gene_name)
    except ValueError as ve:
        msg = ("[pyensembl_utils.get_gene_ids_of_transcript_id]: could not "
               "find transcript id in database: {}".format(transcript_id))
        if raise_on_error:
            raise ValueError(msg) from ve
        logger.warning(msg)
        return None

    return [
        {'transcript_id': transcript_id, 'gene_id': gene_id}
        for gene_id in gene_ids
    ]
def test_ucsc_refseq_genome():
    """
    Exercise a Genome object created from a small RefSeq GTF file
    downloaded from http://genome.ucsc.edu/cgi-bin/hgTables
    """
    with TemporaryDirectory() as scratch_dir:
        genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_REFSEQ_PATH,
            cache_directory_path=scratch_dir,
        )
        genome.index()

        gene_list = genome.genes()
        for entry in gene_list:
            assert entry.id, "Gene with missing ID in %s" % (genome.gtf.dataframe(),)
        n_genes = len(gene_list)
        assert n_genes == 2, "Expected 2 genes, got %d: %s" % (n_genes, gene_list)

        transcript_list = genome.transcripts()
        for entry in transcript_list:
            assert entry.id, "Transcript with missing ID in %s" % (genome.gtf.dataframe(),)
        n_transcripts = len(transcript_list)
        assert n_transcripts == 2, "Expected 2 transcripts, got %d: %s" % (n_transcripts, transcript_list)

        # The locus below is expected to be covered by both genes.
        locus_genes = genome.genes_at_locus(1, 67092176)
        n_locus_genes = len(locus_genes)
        assert n_locus_genes == 2, "Expected 2 genes at locus chr1:67092176, got %d: %s" % (n_locus_genes, locus_genes)

        expected_ids = {"NM_001276352", "NR_075077"}
        eq_(expected_ids, {entry.id for entry in locus_genes})
def test_gtf_only():
    """With only a GTF, genes are available but sequence lookups fail.

    ``transcript_sequence`` and ``protein_sequence`` must each raise a
    ValueError, checked by ``no_transcript_`` and ``no_protein_``.
    """
    gtf_only_sources = dict(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
    )
    genome = Genome(**gtf_only_sources)
    genome.index()

    gene_count = len(genome.genes())
    eq_(1, gene_count)

    with assert_raises(ValueError) as transcript_error:
        genome.transcript_sequence("test")
    no_transcript_(transcript_error)

    with assert_raises(ValueError) as protein_error:
        genome.protein_sequence("test")
    no_protein_(protein_error)
def test_protein_fasta_only():
    """A Genome built from a protein FASTA alone.

    Four protein sequences are loaded; gene queries and transcript sequence
    lookups must raise ValueError (checked by ``no_gtf_`` and
    ``no_transcript_``).
    """
    protein_fastas = [MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH]
    genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        protein_fasta_paths_or_urls=protein_fastas,
    )
    genome.index()

    loaded_proteins = genome.protein_sequences.fasta_dictionary
    eq_(4, len(loaded_proteins))

    with assert_raises(ValueError) as gtf_error:
        genome.genes()
    no_gtf_(gtf_error)

    with assert_raises(ValueError) as transcript_error:
        genome.transcript_sequence("test")
    no_transcript_(transcript_error)
def annotate_xcnv():
    """Append gene and exon annotations to every data row of the XCNV table.

    Reads the tab-separated DATA.xcnv table, indexes a GRCh37 ``Genome`` and
    loads the positive and negative-strand exon tables.  For each data row,
    the start and end positions are parsed from column 2 (split on ``:`` and
    ``-``; the start is incremented by one, and the end falls back to the
    start when it is absent).  Gene names at both positions are collected,
    then extended with a label for every exon on the same contig that
    contains the position.  The two lists are appended to the row, the
    header gains ``CNV_START`` and ``CNV_END``, and the table is written to
    DATA_annotated.xcnv.
    """
    with open(r'Z:\gatk\data\DATA.xcnv','r') as xcnv_file:
        DATA_xcnv = list(csv.reader(xcnv_file, delimiter='\t'))

    annotation = Genome(
        reference_name='GRCh37',
        annotation_name='GRCh37',
        gtf_path_or_url=r'Z:\gatk\reference\Homo_sapiens.GRCh37.75.gtf')
    annotation.index()

    with open(r'./VPv1_exons_positive','r') as positive_file:
        exons_positive = list(csv.reader(positive_file, delimiter='\t'))
    with open(r'./VPv1_exons_negative_flipped','r') as negative_file:
        exons_negative = list(csv.reader(negative_file, delimiter='\t'))

    def exon_label(exon, bump_last_field):
        # Column 3 holds four '_'-separated parts; positive-strand exons have
        # the last part incremented by one, negative-strand ones keep it.
        parts = exon[3].split('_')
        prefix = parts[0] + '_' + parts[1] + ' ' + parts[2] + ' '
        if bump_last_field:
            return prefix + str(int(parts[3]) + 1)
        return prefix + parts[3]

    # Positive-strand exons are processed first, then negative-strand ones.
    exon_tables = ((exons_positive, True), (exons_negative, False))

    for record in DATA_xcnv[1:]:
        contig = str(record[4])
        start_pos = int(re.split(':|-', record[2])[1]) + 1
        start_hits = annotation.gene_names_at_locus(contig=contig, position=start_pos)
        try:
            end_pos = int(re.split(':|-', record[2])[2])
        except IndexError:
            # No end coordinate in the interval: treat it as a single point.
            end_pos = start_pos
        end_hits = annotation.gene_names_at_locus(contig=contig, position=end_pos)

        for exons, bump_last_field in exon_tables:
            for exon in exons:
                # Column 0 carries a three-character prefix before the contig.
                if contig != exon[0][3:]:
                    continue
                if int(exon[1]) <= start_pos <= int(exon[2]):
                    start_hits.append(exon_label(exon, bump_last_field))
                if int(exon[1]) <= end_pos <= int(exon[2]):
                    end_hits.append(exon_label(exon, bump_last_field))

        record.append(start_hits)
        record.append(end_hits)

    DATA_xcnv[0] += ['CNV_START', 'CNV_END']

    with open(r'Z:\gatk\data\DATA_annotated.xcnv','w',newline='') as out_file:
        csv.writer(out_file, delimiter='\t').writerows(DATA_xcnv)
def test_protein_fasta_only():
    """Only a protein FASTA is supplied to the Genome.

    The FASTA yields four protein sequences, while asking for genes or for a
    transcript sequence raises ValueError.
    """
    sources = dict(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH],
    )
    proteins_only = Genome(**sources)
    proteins_only.index()

    eq_(4, len(proteins_only.protein_sequences.fasta_dictionary))

    failing_calls = (
        (lambda: proteins_only.genes(), no_gtf_),
        (lambda: proteins_only.transcript_sequence("DOES_NOT_EXIST"), no_transcript_),
    )
    for failing_call, check_error in failing_calls:
        with assert_raises(ValueError) as cm:
            failing_call()
        check_error(cm)
def test_ucsc_gencode_genome():
    """
    Exercise a Genome created from a small GENCODE GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables
    """
    with TemporaryDirectory() as scratch_dir:
        genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_GENCODE_PATH,
            cache_directory_path=scratch_dir,
        )
        genome.index()

        gene_list = genome.genes()
        for entry in gene_list:
            assert entry.id, "Gene with missing ID in %s" % (genome.gtf.dataframe(),)
        n_genes = len(gene_list)
        assert n_genes == 7, "Expected 7 genes, got %d: %s" % (n_genes, gene_list)

        transcript_list = genome.transcripts()
        for entry in transcript_list:
            assert entry.id, "Transcript with missing ID in %s" % (genome.gtf.dataframe(),)
        n_transcripts = len(transcript_list)
        assert n_transcripts == 7, "Expected 7 transcripts, got %d: %s" % (n_transcripts, transcript_list)

        # This gene is identified by ID only: no name, no biotype.
        by_id = genome.gene_by_id("uc001aak.4")
        eq_(by_id.id, "uc001aak.4")
        eq_(by_id.name, None)
        eq_(by_id.biotype, None)

        # Locus lookups use the "chr1" contig name here.
        first_gene_at_locus = genome.genes_at_locus("chr1", 17369)[0]
        eq_(first_gene_at_locus.id, "uc031tla.1")

        first_transcript_at_locus = genome.transcripts_at_locus("chr1", 30564)[0]
        eq_(first_transcript_at_locus.id, "uc057aty.1")
def test_ucsc_gencode():
    """
    Check an installed Genome built from a small GENCODE GTF file downloaded
    from http://genome.ucsc.edu/cgi-bin/hgTables
    """
    genome = Genome("GRCh38", gtf_path_or_url=UCSC_GENCODE_PATH)
    genome.install()

    # Seven genes and seven transcripts are expected.
    gene_count = len(genome.genes())
    eq_(gene_count, 7)
    transcript_count = len(genome.transcripts())
    eq_(transcript_count, 7)

    # Lookup by ID: the gene has no name and no biotype.
    by_id = genome.gene_by_id("uc001aak.4")
    eq_(by_id.id, "uc001aak.4")
    eq_(by_id.name, None)
    eq_(by_id.biotype, None)

    # Lookups by locus.
    first_gene_at_locus = genome.genes_at_locus(1, 17369)[0]
    eq_(first_gene_at_locus.id, "uc031tla.1")

    first_transcript_at_locus = genome.transcripts_at_locus(1, 30564)[0]
    eq_(first_transcript_at_locus.id, "uc057aty.1")
def get_gene_name_of_transcript_id(transcript_id: str,
                                   ensembl: pyensembl.Genome,
                                   raise_on_error: bool = False):
    """ Look up the gene name (symbol) that belongs to a transcript id.

    Unlike calling gene_name_of_transcript_id directly, this wrapper can
    log a warning and return None instead of raising when the lookup raises
    a ValueError.

    Parameters
    ----------
    transcript_id: string
        The transcript identifier (e.g., "ENSMUST00000035194")

    ensembl: pyensembl.Genome
        The annotation database

    raise_on_error: bool
        Whether to issue a warning (False) or raise a ValueError (True) if
        the lookup of the transcript identifier fails

    Returns
    -------
    gene_name: string
        The gene name (also called gene symbol, e.g., "Mapkapk3")

        --- OR ---

        None, if the lookup failed and `raise_on_error` is False
    """
    try:
        return ensembl.gene_name_of_transcript_id(transcript_id)
    except ValueError as ve:
        msg = ("[pyensembl_utils.get_gene_name_of_transcript_id]: could not "
               "find match for transcript id: {}".format(transcript_id))
        if raise_on_error:
            raise ValueError(msg) from ve
        logger.warning(msg)
        return None
def test_ucsc_refseq():
    """
    Check an installed Genome built from a small RefSeq GTF file downloaded
    from http://genome.ucsc.edu/cgi-bin/hgTables
    """
    genome = Genome("GRCh38", gtf_path_or_url=UCSC_REFSEQ_PATH)
    genome.install()

    # Two genes and two transcripts are expected.
    gene_count = len(genome.genes())
    eq_(gene_count, 2)
    transcript_count = len(genome.transcripts())
    eq_(transcript_count, 2)

    # Both genes overlap this locus.
    overlapping = genome.genes_at_locus(1, 67092176)
    eq_(len(overlapping), 2)

    found_ids = {overlapping_gene.id for overlapping_gene in overlapping}
    eq_({"NM_001276352", "NR_075077"}, found_ids)
def test_gtf_transcript_only():
    """GTF plus transcript FASTA, without a protein FASTA.

    The single gene and its transcript sequence are available, but accessing
    ``protein_sequence`` on the transcript must raise ValueError.
    """
    genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    )
    genome.index()

    gene_count = len(genome.genes())
    eq_(1, gene_count)

    all_transcripts = genome.transcripts()
    leading = all_transcripts[0]
    ok_(leading.sequence)

    with assert_raises(ValueError) as protein_error:
        leading.protein_sequence
    no_protein_(protein_error)
# Remote sources for the mouse annotation, all for the same Ensembl release.
SERVER = "ftp://ftp.ensembl.org"
MOUSE_GTF_PATH = \
    SERVER + "/pub/release-%d/gtf/mus_musculus/Mus_musculus.GRCm38.%d.gtf.gz" % (
        MOUSE_ENSEMBL_RELEASE, MOUSE_ENSEMBL_RELEASE)
# The release number has to be interpolated here as well; without it the URL
# would contain a literal "release-%d" path component.
MOUSE_TRANSCRIPT_FASTA_PATH = \
    SERVER + "/pub/release-%d/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz" % (
        MOUSE_ENSEMBL_RELEASE)
MOUSE_PROTEIN_FASTA_PATH = \
    SERVER + "/pub/release-%d/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz" % (
        MOUSE_ENSEMBL_RELEASE)

MOUSE_VCF = data_path("mouse_vcf_dbsnp_chr1_partial.vcf")

# The same mouse annotation described twice: once through explicit URLs and
# once through the EnsemblRelease shortcut.
explicit_url_genome = Genome(
    reference_name="GRCm38",
    annotation_name="ensembl",
    annotation_version=MOUSE_ENSEMBL_RELEASE,
    gtf_path_or_url=MOUSE_GTF_PATH,
    transcript_fasta_paths_or_urls=[MOUSE_TRANSCRIPT_FASTA_PATH],
    protein_fasta_paths_or_urls=[MOUSE_PROTEIN_FASTA_PATH])

ensembl_mouse_genome = EnsemblRelease(MOUSE_ENSEMBL_RELEASE, species="mouse")


def test_load_vcf_mouse_with_explicit_urls():
    """Loading the mouse VCF against the explicit-URL genome yields 217 variants."""
    variants = load_vcf(MOUSE_VCF, genome=explicit_url_genome)
    eq_(len(variants), 217)


def test_load_vcf_mouse_with_ensembl_release():
    """Loading the mouse VCF against the EnsemblRelease genome yields 217 variants."""
    variants = load_vcf(MOUSE_VCF, genome=ensembl_mouse_genome)
    eq_(len(variants), 217)
# Protein FASTA cropped from
# ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz
# via:
#   grep "ENSMUSG00000017167" Mus_musculus.GRCm38.pep.all.fa -A 50
#
# Tested against:
# http://useast.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000017167

MOUSE_ENSMUSG00000017167_PATH = data_path("mouse.ensembl.81.partial.ENSMUSG00000017167.gtf")
MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH = data_path("mouse.ensembl.81.partial.ENSMUSG00000017167.fa")
MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH = data_path("mouse.ensembl.81.partial.ENSMUSG00000017167.pep")

# Genome restricted to the single-gene GTF / FASTA files above.
custom_mouse_genome_grcm38_subset = Genome(
    reference_name="GRCm38",
    annotation_name="_test_mouse_ensembl81_subset",
    gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
    transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    protein_fasta_path_or_url=MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH,
)


def setup_init_custom_mouse_genome():
    """
    Clear the cache of the custom mouse genome and index it again.

    Add this to the setup of any unit test that has to start from a cleared
    cache.
    """
    subset_genome = custom_mouse_genome_grcm38_subset
    subset_genome.clear_cache()
    subset_genome.index()
def test_custom_genome_to_dict():
    """A custom Genome survives a ``to_dict`` / ``from_dict`` round trip."""
    as_dict = custom_mouse_genome_grcm38_subset.to_dict()
    rebuilt = Genome.from_dict(as_dict)
    eq_(custom_mouse_genome_grcm38_subset, rebuilt)
def load_genome(file):
    """Build the module-level ``data`` genome from a GTF file and index it.

    Parameters
    ----------
    file:
        Passed to ``Genome`` as ``gtf_path_or_url``.
    """
    global data
    genome_options = {
        'reference_name': 'GRCh38',
        'annotation_name': 'ENSEMBL',
        'gtf_path_or_url': file,
    }
    # The global is bound before indexing, so it is set even if index() fails.
    data = Genome(**genome_options)
    data.index()
def test_custom_genome_to_dict():
    """Rebuilding the custom Genome from its dict form gives an equal object."""
    original = custom_mouse_genome_grcm38_subset
    state = original.to_dict()
    eq_(original, Genome.from_dict(state))
def test_custom_genome_to_json():
    """A custom Genome survives a ``to_json`` / ``from_json`` round trip."""
    serialized = custom_mouse_genome_grcm38_subset.to_json()
    rebuilt = Genome.from_json(serialized)
    eq_(custom_mouse_genome_grcm38_subset, rebuilt)
def test_transcript_fasta_only():
    """A Genome built from a transcript FASTA alone.

    Two transcript sequences are loaded.  Every GTF-backed query and the
    protein sequence lookup must raise ValueError; each raised error is
    passed to the matching checker (``no_gtf_`` / ``no_protein_``).
    """
    genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    )
    genome.index()

    eq_(2, len(genome.transcript_sequences.fasta_dictionary))

    # (call expected to fail, checker for the raised error), in order.
    expectations = (
        (lambda: genome.genes(), no_gtf_),
        (lambda: genome.gene_ids(), no_gtf_),
        (lambda: genome.gene_ids_of_gene_name("test"), no_gtf_),
        (lambda: genome.transcript_names(), no_gtf_),
        (lambda: genome.protein_sequence("test"), no_protein_),
    )
    for failing_call, check_error in expectations:
        with assert_raises(ValueError) as cm:
            failing_call()
        check_error(cm)
def test_transcript_fasta_only():
    """Only a transcript FASTA is supplied to the Genome.

    The FASTA yields two transcript sequences, while gene queries,
    transcript-name queries and protein sequence lookups raise ValueError.
    """
    sources = dict(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    )
    genome = Genome(**sources)
    genome.index()

    loaded_transcripts = genome.transcript_sequences.fasta_dictionary
    eq_(2, len(loaded_transcripts))

    # Queries that need a GTF.
    with assert_raises(ValueError) as genes_error:
        genome.genes()
    no_gtf_(genes_error)

    with assert_raises(ValueError) as gene_ids_error:
        genome.gene_ids()
    no_gtf_(gene_ids_error)

    with assert_raises(ValueError) as gene_name_error:
        genome.gene_ids_of_gene_name("test")
    no_gtf_(gene_name_error)

    with assert_raises(ValueError) as transcript_names_error:
        genome.transcript_names()
    no_gtf_(transcript_names_error)

    # Query that needs a protein FASTA.
    with assert_raises(ValueError) as protein_error:
        genome.protein_sequence("test")
    no_protein_(protein_error)