def test_ucsc_refseq_genome():
    """
    Build a Genome from the small UCSC RefSeq GTF fixture and check that it
    exposes exactly two genes and two transcripts (each with an ID), and that
    both genes are reported at locus 1:67092176.

    The GTF file was downloaded from http://genome.ucsc.edu/cgi-bin/hgTables
    """
    with TemporaryDirectory() as cache_dir:
        refseq_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_REFSEQ_PATH,
            cache_directory_path=cache_dir)
        refseq_genome.index()

        # Every gene must carry an ID; the GTF dataframe is only rendered
        # into the message if the assertion actually fails.
        all_genes = refseq_genome.genes()
        for g in all_genes:
            assert g.id, \
                "Gene with missing ID in %s" % (refseq_genome.gtf.dataframe(),)
        n_genes = len(all_genes)
        assert n_genes == 2, \
            "Expected 2 genes, got %d: %s" % (
                n_genes, all_genes)

        # Same checks for transcripts.
        all_transcripts = refseq_genome.transcripts()
        for t in all_transcripts:
            assert t.id, \
                "Transcript with missing ID in %s" % (refseq_genome.gtf.dataframe(),)
        n_transcripts = len(all_transcripts)
        assert n_transcripts == 2, \
            "Expected 2 transcripts, got %d: %s" % (
                n_transcripts, all_transcripts)

        # Both genes are expected to overlap this single position.
        overlapping = refseq_genome.genes_at_locus(1, 67092176)
        n_overlapping = len(overlapping)
        assert n_overlapping == 2, \
            "Expected 2 genes at locus chr1:67092176, got %d: %s" % (
                n_overlapping, overlapping)
        observed_ids = {g.id for g in overlapping}
        eq_({"NM_001276352", "NR_075077"}, observed_ids)
def test_transcript_fasta_only():
    """
    A Genome built from a transcript FASTA alone should load its two
    transcript sequences, raise ValueError for GTF-backed queries, and raise
    ValueError for protein sequence lookups.
    """
    transcripts_only = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        transcript_fasta_paths_or_urls=[
            MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH
        ])
    transcripts_only.index()

    eq_(2, len(transcripts_only.transcript_sequences.fasta_dictionary))

    # (method name, positional args) for each query checked with no_gtf_,
    # exercised in the same order as before.
    gtf_dependent_calls = (
        ("genes", ()),
        ("gene_ids", ()),
        ("gene_ids_of_gene_name", ("test",)),
        ("transcript_names", ()),
    )
    for method_name, call_args in gtf_dependent_calls:
        with assert_raises(ValueError) as cm:
            getattr(transcripts_only, method_name)(*call_args)
        no_gtf_(cm)

    # Protein lookups are checked with no_protein_ instead.
    with assert_raises(ValueError) as cm:
        transcripts_only.protein_sequence("test")
    no_protein_(cm)
def main():
    """
    Entry point for data preparation.

    Steps, driven by the parsed command-line arguments from ``get_args()``:

    1. Unless ``args.skip_eventalign_indexing`` is set, index the eventalign
       output with ``parallel_index``.
    2. Preprocess reads into per-position records, either per gene
       (``args.genome`` set; annotation comes from a customised ``Genome`` or
       from an ``EnsemblRelease``) or per transcript.

    Raises:
        SystemExit: with status 1 when gene-level processing with a
            customised genome is requested but one of ``reference_name``,
            ``annotation_name``, ``gtf_path_or_url`` or
            ``transcript_fasta_paths_or_urls`` was not provided. Previously
            this case only printed a hint and then crashed with an unbound
            local variable after the indexing step had already run.
    """
    args = get_args()

    n_processes = args.n_processes
    eventalign_filepath = args.eventalign
    summary_filepath = args.summary
    chunk_size = args.chunk_size
    out_dir = args.out_dir
    ensembl_version = args.ensembl
    ensembl_species = args.species
    readcount_min = args.readcount_min
    readcount_max = args.readcount_max
    resume = args.resume
    genome = args.genome
    customised_genome = args.customised_genome

    # Read the customised-genome settings unconditionally so the names are
    # always bound; they are only used when building a customised Genome.
    reference_name = args.reference_name
    annotation_name = args.annotation_name
    gtf_path_or_url = args.gtf_path_or_url
    transcript_fasta_paths_or_urls = args.transcript_fasta_paths_or_urls

    if customised_genome and (None in [
            reference_name, annotation_name, gtf_path_or_url,
            transcript_fasta_paths_or_urls
    ]):
        print(
            'If you have your own customised genome not in Ensembl, please provide the following'
        )
        print('- reference_name')
        print('- annotation_name')
        print('- gtf_path_or_url')
        print('- transcript_fasta_paths_or_urls')
        if genome:
            # Gene-level processing cannot proceed without these values, so
            # stop before any expensive work is done. Transcript-level
            # processing does not use them and continues as before.
            raise SystemExit(1)

    misc.makedirs(out_dir)  #todo: check every level.

    # (1) For each read, combine multiple events aligned to the same
    # positions, the results from nanopolish eventalign, into a single event
    # per position.
    if not args.skip_eventalign_indexing:
        parallel_index(eventalign_filepath, summary_filepath, chunk_size,
                       out_dir, n_processes, resume)

    # (2) Create a .json file, where the info of all reads are stored per
    # position, for modelling.
    if genome:
        if customised_genome:
            db = Genome(
                reference_name=reference_name,
                annotation_name=annotation_name,
                gtf_path_or_url=gtf_path_or_url,
                transcript_fasta_paths_or_urls=transcript_fasta_paths_or_urls)
            # parse GTF and construct database of genomic features
            db.index()
        else:
            # Default: human reference genome GRCh38 release 91 used in the
            # ont mapping.
            db = EnsemblRelease(ensembl_version, ensembl_species)
        parallel_preprocess_gene(eventalign_filepath, db, out_dir,
                                 n_processes, readcount_min, readcount_max,
                                 resume)
    else:
        parallel_preprocess_tx(eventalign_filepath, out_dir, n_processes,
                               readcount_min, readcount_max, resume)
def _exon_label(exon_name, bump_exon_number):
    """Turn an underscore-separated exon name 'a_b_c_d' into 'a_b c d'.

    When ``bump_exon_number`` is true the fourth field is parsed as an int
    and incremented by one; otherwise it is kept verbatim.
    """
    parts = exon_name.split('_')
    exon_number = str(int(parts[3]) + 1) if bump_exon_number else parts[3]
    return parts[0] + '_' + parts[1] + ' ' + parts[2] + ' ' + exon_number


def _append_exon_hits(exons, contig, start, end, start_hits, end_hits,
                      bump_exon_number):
    """Append labels of exons on ``contig`` containing ``start`` / ``end``.

    Each exon row is read as (name with a 3-character prefix before the
    contig, lower bound, upper bound, exon name); bounds are inclusive.
    """
    for exon in exons:
        # exon[0][3:] drops the first three characters before comparing
        # (presumably a 'chr' prefix -- confirm against the exon files).
        if contig != exon[0][3:]:
            continue
        lower, upper = int(exon[1]), int(exon[2])
        if lower <= start <= upper:
            start_hits.append(_exon_label(exon[3], bump_exon_number))
        if lower <= end <= upper:
            end_hits.append(_exon_label(exon[3], bump_exon_number))


def annotate_xcnv(xcnv_path=r'Z:\gatk\data\DATA.xcnv',
                  gtf_path=r'Z:\gatk\reference\Homo_sapiens.GRCh37.75.gtf',
                  positive_exons_path=r'./VPv1_exons_positive',
                  negative_exons_path=r'./VPv1_exons_negative_flipped',
                  output_path=r'Z:\gatk\data\DATA_annotated.xcnv'):
    """
    Annotate each CNV row of a tab-separated .xcnv file with what lies at its
    start and end positions, and write the result to ``output_path``.

    For every data row (the first row is treated as the header) two lists are
    appended as extra columns: gene names at the start position and at the
    end position (from ``Genome.gene_names_at_locus``), each extended with
    labels of matching exons from the positive and negative exon tables. The
    header gains ``CNV_START`` and ``CNV_END``.

    All parameters default to the paths that were previously hard-coded, so
    calling ``annotate_xcnv()`` with no arguments behaves as before.

    Args:
        xcnv_path: input .xcnv table (tab-separated, with header row).
        gtf_path: GTF annotation used to build the GRCh37 ``Genome``.
        positive_exons_path: tab-separated exon table whose exon numbers are
            incremented by one in the label.
        negative_exons_path: tab-separated exon table whose exon numbers are
            used as-is.
        output_path: destination of the annotated table.

    Returns:
        None
    """
    with open(xcnv_path, 'r') as xcnv_file:
        xcnv_rows = list(csv.reader(xcnv_file, delimiter='\t'))

    genome = Genome(
        reference_name='GRCh37',
        annotation_name='GRCh37',
        gtf_path_or_url=gtf_path)
    genome.index()

    with open(positive_exons_path, 'r') as positive_file:
        exons_positive = list(csv.reader(positive_file, delimiter='\t'))
    with open(negative_exons_path, 'r') as negative_file:
        exons_negative = list(csv.reader(negative_file, delimiter='\t'))

    for cnv_row in xcnv_rows[1:]:
        contig = str(cnv_row[4])
        # Column 2 is split on ':' or '-' (presumably 'contig:start-end' --
        # confirm); the parsed start is shifted by one, as in the original.
        interval_fields = re.split(':|-', cnv_row[2])
        start = int(interval_fields[1]) + 1
        try:
            end = int(interval_fields[2])
        except IndexError:
            # No end coordinate in the interval: treat it as a single position.
            end = start

        start_hits = genome.gene_names_at_locus(contig=contig, position=start)
        end_hits = genome.gene_names_at_locus(contig=contig, position=end)

        _append_exon_hits(exons_positive, contig, start, end,
                          start_hits, end_hits, bump_exon_number=True)
        _append_exon_hits(exons_negative, contig, start, end,
                          start_hits, end_hits, bump_exon_number=False)

        cnv_row.append(start_hits)
        cnv_row.append(end_hits)

    xcnv_rows[0] += ['CNV_START', 'CNV_END']

    with open(output_path, 'w', newline='') as out_file:
        csv.writer(out_file, delimiter='\t').writerows(xcnv_rows)
    return
def main():
    """
    Annotate each line of the BED file named by the first command-line
    argument.

    A GRCh38 Genome is built from the local ``Homo_sapiens.GRCh38.90.gtf``
    and indexed; for every line, a ``Region`` is created from its first three
    whitespace-separated fields, ``get_symbol`` is called on it with the
    genome, and the stripped line is printed followed by ``region.gene``.
    """
    bed_path = sys.argv[1]
    annotation_gtf = "Homo_sapiens.GRCh38.90.gtf"

    hg38 = Genome(
        reference_name='GRCh38',
        annotation_name='my_genome_features',
        gtf_path_or_url=annotation_gtf)
    hg38.index()

    with open(bed_path) as bed_file:
        for raw_line in bed_file:
            record = raw_line.strip()
            # Only the first three fields are used; any extras are ignored.
            chromosome, start, end, *_ = record.split()
            region = Region(chromosome, start, end)
            region.get_symbol(hg38)
            print(record, region.gene)
def test_gtf_only():
    """
    A Genome built from a GTF alone should expose its single gene but raise
    ValueError for transcript and protein sequence lookups.
    """
    gtf_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH)
    gtf_genome.index()

    gene_count = len(gtf_genome.genes())
    eq_(1, gene_count)

    # Each sequence lookup is paired with the helper that inspects the
    # captured exception.
    sequence_checks = (
        ("transcript_sequence", no_transcript_),
        ("protein_sequence", no_protein_),
    )
    for method_name, check_error in sequence_checks:
        with assert_raises(ValueError) as cm:
            getattr(gtf_genome, method_name)("test")
        check_error(cm)
def test_gtf_transcript_only():
    """
    A Genome built from a GTF plus a transcript FASTA should expose its
    single gene and a transcript sequence, but raise ValueError when a
    transcript's protein sequence is accessed.
    """
    # NOTE(review): this uses the singular `transcript_fasta_path_or_url`
    # keyword while other callers pass `transcript_fasta_paths_or_urls`;
    # confirm which one the targeted pyensembl version accepts.
    genome_kwargs = dict(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH)
    gtf_and_cdna = Genome(**genome_kwargs)
    gtf_and_cdna.index()

    gene_count = len(gtf_and_cdna.genes())
    eq_(1, gene_count)

    first_transcript = gtf_and_cdna.transcripts()[0]
    ok_(first_transcript.sequence)

    # Attribute access alone is expected to raise.
    with assert_raises(ValueError) as cm:
        first_transcript.protein_sequence
    no_protein_(cm)
def test_protein_fasta_only():
    """
    A Genome built from a protein FASTA alone should load its four protein
    sequences, and raise ValueError for gene queries and transcript sequence
    lookups.
    """
    protein_fasta_paths = [MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH]
    proteins_only = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        protein_fasta_paths_or_urls=protein_fasta_paths)
    proteins_only.index()

    loaded_proteins = proteins_only.protein_sequences.fasta_dictionary
    eq_(4, len(loaded_proteins))

    # Gene queries are checked with no_gtf_.
    with assert_raises(ValueError) as cm:
        proteins_only.genes()
    no_gtf_(cm)

    # Transcript sequence lookups are checked with no_transcript_.
    with assert_raises(ValueError) as cm:
        proteins_only.transcript_sequence("DOES_NOT_EXIST")
    no_transcript_(cm)
def test_ucsc_gencode_genome():
    """
    Build a Genome from the small UCSC GENCODE GTF fixture and check that it
    exposes exactly seven genes and seven transcripts (each with an ID), plus
    a few specific lookups by ID and by locus.

    The GTF file was downloaded from http://genome.ucsc.edu/cgi-bin/hgTables
    """
    with TemporaryDirectory() as cache_dir:
        gencode_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_GENCODE_PATH,
            cache_directory_path=cache_dir)
        gencode_genome.index()

        # Every gene must carry an ID; the GTF dataframe is only rendered
        # into the message if the assertion actually fails.
        all_genes = gencode_genome.genes()
        for g in all_genes:
            assert g.id, \
                "Gene with missing ID in %s" % (gencode_genome.gtf.dataframe(),)
        n_genes = len(all_genes)
        assert n_genes == 7, \
            "Expected 7 genes, got %d: %s" % (
                n_genes, all_genes)

        # Same checks for transcripts.
        all_transcripts = gencode_genome.transcripts()
        for t in all_transcripts:
            assert t.id, \
                "Transcript with missing ID in %s" % (gencode_genome.gtf.dataframe(),)
        n_transcripts = len(all_transcripts)
        assert n_transcripts == 7, \
            "Expected 7 transcripts, got %d: %s" % (
                n_transcripts, all_transcripts)

        # Lookup by ID: this gene is expected to have no name or biotype.
        looked_up = gencode_genome.gene_by_id("uc001aak.4")
        eq_(looked_up.id, "uc001aak.4")
        eq_(looked_up.name, None)
        eq_(looked_up.biotype, None)

        # Lookups by locus: only the first hit is checked.
        genes_here = gencode_genome.genes_at_locus("chr1", 17369)
        eq_(genes_here[0].id, "uc031tla.1")

        transcripts_here = gencode_genome.transcripts_at_locus("chr1", 30564)
        eq_(transcripts_here[0].id, "uc057aty.1")
# Protein FASTA cropped from ftp://ftp.ensembl.org/pub/release-81/fasta/ # mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz via: # grep "ENSMUSG00000017167" Mus_musculus.GRCm38.pep.all.fa -A 50 # Tested against: # http://useast.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000017167 MOUSE_ENSMUSG00000017167_PATH = data_path( "mouse.ensembl.81.partial.ENSMUSG00000017167.gtf") MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH = data_path( "mouse.ensembl.81.partial.ENSMUSG00000017167.fa") MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH = data_path( "mouse.ensembl.81.partial.ENSMUSG00000017167.pep") custom_mouse_genome_grcm38_subset = Genome( reference_name="GRCm38", annotation_name="_test_mouse_ensembl81_subset", gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH, transcript_fasta_path_or_url=MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH, protein_fasta_path_or_url=MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH) def setup_init_custom_mouse_genome(): """ If a unit test needs to start from a cleared cache, add this to the test setup. """ custom_mouse_genome_grcm38_subset.clear_cache() custom_mouse_genome_grcm38_subset.index()
# Ensembl FTP locations of the mouse GRCm38 annotation and sequence files
# for release MOUSE_ENSEMBL_RELEASE.
SERVER = "ftp://ftp.ensembl.org"
MOUSE_GTF_PATH = \
    SERVER + "/pub/release-%d/gtf/mus_musculus/Mus_musculus.GRCm38.%d.gtf.gz" % (
        MOUSE_ENSEMBL_RELEASE, MOUSE_ENSEMBL_RELEASE)
# The release number must be substituted here as well; without the
# formatting step the URL would contain a literal "%d".
MOUSE_TRANSCRIPT_FASTA_PATH = \
    SERVER + "/pub/release-%d/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz" % (
        MOUSE_ENSEMBL_RELEASE,)
MOUSE_PROTEIN_FASTA_PATH = \
    SERVER + "/pub/release-%d/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz" % (
        MOUSE_ENSEMBL_RELEASE,)

# Local VCF fixture loaded by both tests below.
MOUSE_VCF = data_path("mouse_vcf_dbsnp_chr1_partial.vcf")

# Genome described by explicit URLs, to be compared against the equivalent
# EnsemblRelease object.
explicit_url_genome = Genome(
    reference_name="GRCm38",
    annotation_name="ensembl",
    annotation_version=MOUSE_ENSEMBL_RELEASE,
    gtf_path_or_url=MOUSE_GTF_PATH,
    transcript_fasta_paths_or_urls=[MOUSE_TRANSCRIPT_FASTA_PATH],
    protein_fasta_paths_or_urls=[MOUSE_PROTEIN_FASTA_PATH])

ensembl_mouse_genome = EnsemblRelease(MOUSE_ENSEMBL_RELEASE, species="mouse")


def test_load_vcf_mouse_with_explicit_urls():
    """Loading the mouse VCF against the explicit-URL genome yields 217 variants."""
    variants = load_vcf(MOUSE_VCF, genome=explicit_url_genome)
    eq_(len(variants), 217)


def test_load_vcf_mouse_with_ensembl_release():
    """Loading the mouse VCF against the EnsemblRelease genome yields 217 variants."""
    variants = load_vcf(MOUSE_VCF, genome=ensembl_mouse_genome)
    eq_(len(variants), 217)
def load_genome(file):
    """
    Build a GRCh38 Genome from the GTF at ``file``, store it in the
    module-level ``data`` variable, and index it.

    Args:
        file: path or URL of the GTF annotation, passed to ``Genome`` as
            ``gtf_path_or_url``.
    """
    global data
    genome_options = {
        'reference_name': 'GRCh38',
        'annotation_name': 'ENSEMBL',
        'gtf_path_or_url': file,
    }
    # The global is bound before indexing, so it is set even if index() fails.
    data = Genome(**genome_options)
    data.index()