def test_subset_genes_should_return_original_if_no_additional_fields_or_valid_biotypes(
    self,
):
    fasta_name = os.path.join(self.outdir, "ci.fa.gz")
    gtf_name = os.path.join(self.outdir, "ci.gtf.gz")
    conversion_name = os.path.join(self.outdir, "ci_ids.csv")

    idx = index.Index("ciona_intestinalis", index_folder_name=self.outdir)
    idx._download_ensembl_files(
        ensemble_release=None,
        fasta_name=fasta_name,
        gtf_name=gtf_name,
        conversion_name=conversion_name,
    )

    truncated_gtf = os.path.join(self.outdir, "test.gtf")
    idx._subset_genes(conversion_name, gtf_name, truncated_gtf, valid_biotypes=None)

    # expect the same file as the original file
    self.assertTrue(os.path.isfile(truncated_gtf))

    # the current implementation of GTF Reader doesn't allow this:
    # for gr1, gr2 in zip(gtf.Reader(gtf_name), gtf.Reader(truncated_gtf)):
    records = []
    for gr in gtf.Reader(gtf_name):
        records.append(gtf.Record(gr))
    for i, gr in enumerate(gtf.Reader(truncated_gtf)):
        rec1 = records[i]
        rec2 = gtf.Record(gr)
        self.assertEqual(rec1, rec2)
def test_subset_genes_produces_a_reduced_annotation_file_when_passed_fields(self):
    organism = 'ciona_intestinalis'
    idx = index.Index(organism, ['entrezgene'])
    os.chdir(self.outdir)
    idx._download_ensembl_files()
    self.assertTrue(os.path.isfile('%s.fa.gz' % organism), 'fasta file not found')
    self.assertTrue(os.path.isfile('%s.gtf.gz' % organism), 'gtf file not found')
    self.assertTrue(os.path.isfile('%s_ids.csv' % organism), 'id file not found')

    idx._subset_genes()
    self.assertTrue(os.path.isfile('%s_multiconsortia.gtf' % organism))
    gr_subset = gtf.Reader('%s_multiconsortia.gtf' % organism)
    gr_complete = gtf.Reader('%s.gtf.gz' % organism)
    self.assertLess(
        len(gr_subset), len(gr_complete),
        'Subset annotation was not smaller than the complete annotation')

    # make sure only valid biotypes are returned
    complete_invalid = False
    valid_biotypes = {b'protein_coding', b'lincRNA'}
    for r in gr_complete.iter_genes():
        if r.attribute(b'gene_biotype') not in valid_biotypes:
            complete_invalid = True
            break
    self.assertTrue(complete_invalid)

    subset_invalid = False
    for r in gr_subset.iter_genes():
        if r.attribute(b'gene_biotype') not in valid_biotypes:
            subset_invalid = True
            break
    self.assertFalse(subset_invalid)
    self.assertGreater(len(gr_subset), 0)
def test_subset_genes_produces_a_reduced_annotation_file_when_passed_fields(self):
    organism = "ciona_intestinalis"
    idx = index.Index(organism, ["external_gene_name"], index_folder_name=self.outdir)
    idx._download_ensembl_files(ensemble_release=None)
    self.assertTrue(
        os.path.isfile(os.path.join(self.outdir, "%s.fa.gz" % organism)),
        "fasta file not found",
    )
    self.assertTrue(
        os.path.isfile(os.path.join(self.outdir, "%s.gtf.gz" % organism)),
        "gtf file not found",
    )
    self.assertTrue(
        os.path.isfile(os.path.join(self.outdir, "%s_ids.csv" % organism)),
        "id file not found",
    )

    valid_biotypes = {"protein_coding", "lincRNA"}
    idx._subset_genes(valid_biotypes=valid_biotypes)
    self.assertTrue(
        os.path.isfile(os.path.join(self.outdir, organism, "annotations.gtf"))
    )

    gr_subset = gtf.Reader(os.path.join(self.outdir, organism, "annotations.gtf"))
    gr_complete = gtf.Reader(os.path.join(self.outdir, "%s.gtf.gz" % organism))
    self.assertLess(
        len(gr_subset),
        len(gr_complete),
        "Subset annotation was not smaller than the complete annotation",
    )

    # make sure only valid biotypes are returned
    complete_invalid = False
    for r in gr_complete:
        record = gtf.Record(r)
        if record.attribute("gene_biotype") not in valid_biotypes:
            complete_invalid = True
            break
    self.assertTrue(complete_invalid)

    subset_invalid = False
    for r in gr_subset:
        record = gtf.Record(r)
        if record.attribute("gene_biotype") not in valid_biotypes:
            subset_invalid = True
            break
    self.assertFalse(subset_invalid)
    self.assertGreater(len(gr_subset), 0)
def test_download_gtf_file_gets_a_file_readable_by_seqc_gtf_reader(self):
    idx = index.Index('ciona_intestinalis', ['entrezgene'])
    with ftplib.FTP(host='ftp.ensembl.org') as ftp:
        ftp.login()
        filename = self.outdir + 'ci.gtf.gz'
        idx._download_gtf_file(ftp, filename)
    rd = gtf.Reader(filename)
    rc = next(rd.iter_genes())
    self.assertIsInstance(rc, gtf.Gene)
    os.remove(filename)
def test_download_gtf_file_gets_a_file_readable_by_seqc_gtf_reader(self): idx = index.Index("ciona_intestinalis", ["entrezgene"]) with ftplib.FTP(host="ftp.ensembl.org") as ftp: ftp.login() filename = self.outdir + "ci.gtf.gz" idx._download_gtf_file(ftp, filename, ensemble_release=99) rd = gtf.Reader(filename) (transcript_chromosome, transcript_strand, transcript_gene_id), exons = next(rd.iter_transcripts()) # (('1', '+', 17842), [['1', 'ensembl', 'exon', '1636', '1902', '.', '+', '.', 'gene_id "ENSCING00000017842"; gene_version "1"; transcript_id "ENSCINT00000030147"; transcript_version "1"; exon_number "1"; gene_name "RNaseP_nuc"; gene_source "ensembl"; gene_biotype "misc_RNA"; transcript_name "RNaseP_nuc-201"; transcript_source "ensembl"; transcript_biotype "misc_RNA"; exon_id "ENSCINE00000207263"; exon_version "1";\n']]) self.assertEqual(transcript_chromosome, "1") self.assertEqual(transcript_strand, "+") self.assertEqual(transcript_gene_id, 17842) self.assertEqual(len(exons), 1)
def test_iter_transcripts(self):
    rd = gtf.Reader(self.annotation)
    (transcript_chromosome, transcript_strand, transcript_gene_id), exons = next(
        rd.iter_transcripts()
    )
    # this should give us the 3 exons of the first transcript of the first gene found,
    # in reverse order:
    #
    # chr19 HAVANA gene 60951 71626 . - . gene_id "ENSG00000282458.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; level 2; havana_gene "OTTHUMG00000180466.8";
    # chr19 HAVANA transcript 60951 70976 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2";
    # chr19 HAVANA exon 70928 70976 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 1; exon_id "ENSE00003781173.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2";
    # chr19 HAVANA exon 66346 66499 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 2; exon_id "ENSE00003783498.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2";
    # chr19 HAVANA exon 60951 61894 . - . gene_id "ENSG00000282458.1"; transcript_id "ENST00000632506.1"; gene_type "transcribed_processed_pseudogene"; gene_status "KNOWN"; gene_name "WASH5P"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "WASH5P-008"; exon_number 3; exon_id "ENSE00003783010.1"; level 2; tag "basic"; transcript_support_level "1"; havana_gene "OTTHUMG00000180466.8"; havana_transcript "OTTHUMT00000471217.2";
    self.assertEqual(transcript_chromosome, "chr19")
    self.assertEqual(transcript_strand, "-")
    self.assertEqual(transcript_gene_id, 282458)
    self.assertEqual(len(exons), 3)

    # the attribute field (index 8, i.e. the 9th GTF column) contains the exon ID
    self.assertIn("ENSE00003783010.1", exons[0][8])  # exon number 3
    self.assertIn("ENSE00003783498.1", exons[1][8])  # exon number 2
    self.assertIn("ENSE00003781173.1", exons[2][8])  # exon number 1
def get_length_of_gtf(self):
    rd = gtf.Reader(self.annotation)
    # print(len(rd))
    print(sum(1 for _ in rd.iter_transcripts()))
def test_num_of_transcripts(self):
    rd = gtf.Reader(self.annotation)
    num_transcripts = sum(1 for _ in rd.iter_transcripts())
    # awk -F'\t' '$3=="transcript" { print $0 }' annotations.gtf | wc -l
    self.assertEqual(num_transcripts, 12747)
def _subset_genes(
    self,
    conversion_file: str = None,
    gtf_file: str = None,
    truncated_annotation: str = None,
    valid_biotypes=("protein_coding", "lincRNA"),
):
    """
    Remove any annotation from the annotation_file that is not also defined by at
    least one additional identifier present in the conversion file.

    The effect of these fields is to limit the ENSEMBL genes to a subset of genes
    that are also defined by other consortia. The rationale for requiring multiple
    definitions is that ENSEMBL has very relaxed standards, with many of its genes
    being defined based on predicted locations without any biological evidence or,
    more importantly, any associated biological information. Such genes are often
    uninformative as a result and are better excluded from the index.

    valid_biotypes removes genes of biotypes that single-cell sequencing is unlikely
    to detect. For example, miRNA are rarely poly-adenylated and are small enough
    that they are often removed with primers. In our experience, the only biotypes
    worth considering are protein-coding genes and lincRNA, the defaults for this
    function.

    :param conversion_file: file location of the conversion file
    :param gtf_file: file location of the annotation file
    :param truncated_annotation: name for the generated output file
    :param list(str) valid_biotypes: only accept genes of these biotypes
    """
    if gtf_file is None:
        gtf_file = os.path.join(
            self.index_folder_name, "{}.gtf.gz".format(self.organism)
        )
    if conversion_file is None:
        conversion_file = os.path.join(
            self.index_folder_name, "{}_ids.csv".format(self.organism)
        )
    if truncated_annotation is None:
        truncated_annotation = os.path.join(
            self.index_folder_name, self.organism, "annotations.gtf"
        )

    if not (self.additional_id_types or valid_biotypes):
        # nothing to be done; no need to truncate the annotation file.
        # just make a copy of the original file so that it can be added to the
        # final output directory
        cmd = "gunzip -c {} > {}".format(gtf_file, truncated_annotation)
        err = check_call(cmd, shell=True)
        if err:
            raise ChildProcessError("annotation file decompression failed: %s" % err)
        return

    # change to set for efficiency
    valid_biotypes = set(valid_biotypes)

    # extract valid ensembl ids from the conversion file
    c = pd.read_csv(conversion_file, index_col=[0])

    if c.shape[1] == 1:
        # index == ensembl_gene_id & col 1 == hgnc_symbol
        valid_ensembl_ids = set(c[np.any(~c.isnull().values, axis=1)].index)
    elif c.shape[1] == 0:
        # index == ensembl_gene_id & no columns
        # set to None to take all IDs
        valid_ensembl_ids = None
    else:
        raise Exception("Not implemented/supported shape={}".format(c.shape))

    # remove any invalid ids from the annotation file
    gr = gtf.Reader(gtf_file)
    with open(truncated_annotation, "wt") as f:
        for line_fields in gr:
            record = gtf.Record(line_fields)
            # include only biotypes of interest
            if record.attribute("gene_biotype") in valid_biotypes:
                if (valid_ensembl_ids is None) or (
                    record.attribute("gene_id") in valid_ensembl_ids
                ):
                    f.write("\t".join(line_fields))
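# A minimal usage sketch for _subset_genes, for illustration only and based on the
# call patterns in the tests above; `outdir` is a hypothetical working directory,
# and the Ensembl release is left at its default.
#
#     idx = index.Index(
#         "ciona_intestinalis", ["external_gene_name"], index_folder_name=outdir
#     )
#     idx._download_ensembl_files(ensemble_release=None)
#     idx._subset_genes(valid_biotypes={"protein_coding", "lincRNA"})
#     # the filtered annotation is written to <outdir>/ciona_intestinalis/annotations.gtf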
def _subset_genes(self, conversion_file: str = None, gtf_file: str = None,
                  truncated_annotation: str = None,
                  valid_biotypes=(b'protein_coding', b'lincRNA')):
    """
    Remove any annotation from the annotation_file that is not also defined by at
    least one additional identifier present in the conversion file.

    The effect of these fields is to limit the ENSEMBL genes to a subset of genes
    that are also defined by other consortia. The rationale for requiring multiple
    definitions is that ENSEMBL has very relaxed standards, with many of its genes
    being defined based on predicted locations without any biological evidence or,
    more importantly, any associated biological information. Such genes are often
    uninformative as a result and are better excluded from the index.

    valid_biotypes removes genes of biotypes that single-cell sequencing is unlikely
    to detect. For example, miRNA are rarely poly-adenylated and are small enough
    that they are often removed with primers. In our experience, the only biotypes
    worth considering are protein-coding genes and lincRNA, the defaults for this
    function.

    :param conversion_file: file location of the conversion file
    :param gtf_file: file location of the annotation file
    :param truncated_annotation: name for the generated output file
    :param list(bytes) valid_biotypes: only accept genes of these biotypes
    """
    if not (self.additional_id_types or valid_biotypes):
        # nothing to be done
        return

    # change to set for efficiency
    if all(isinstance(t, str) for t in valid_biotypes):
        valid_biotypes = set(t.encode() for t in valid_biotypes)
    elif all(isinstance(t, bytes) for t in valid_biotypes):
        valid_biotypes = set(valid_biotypes)
    else:
        raise TypeError(
            'mixed-type biotypes detected. Please pass valid_biotypes '
            'as strings or bytes objects (but not both).')

    if gtf_file is None:
        gtf_file = '%s/%s.gtf.gz' % (self.index_folder_name, self.organism)
    if conversion_file is None:
        conversion_file = '%s/%s_ids.csv' % (self.index_folder_name, self.organism)
    if truncated_annotation is None:
        truncated_annotation = '%s/%s_multiconsortia.gtf' % (
            self.index_folder_name, self.organism)

    # extract valid ensembl ids from the conversion file
    c = pd.read_csv(conversion_file, index_col=[0])
    valid_ensembl_ids = set(c[np.any(~c.isnull().values, axis=1)].index)

    # remove any invalid ids from the annotation file
    gr = gtf.Reader(gtf_file)
    with open(truncated_annotation, 'wb') as f:
        for line_fields in gr:
            record = gtf.Record(line_fields)
            if (record.attribute(b'gene_id').decode() in valid_ensembl_ids and
                    record.attribute(b'gene_biotype') in valid_biotypes):
                f.write(bytes(record))