Exemple #1
0
    def test_subset_genes_should_returns_original_if_no_additional_fields_or_valid_biotypes(
        self, ):
        """Check that ``_subset_genes`` reproduces the annotation unchanged.

        The index is created without additional id types and
        ``valid_biotypes=None`` is passed, so there is nothing to filter on:
        the truncated GTF must contain exactly the same records, in the same
        order, as the downloaded (gzipped) GTF.
        """

        fasta_name = os.path.join(self.outdir, "ci.fa.gz")
        gtf_name = os.path.join(self.outdir, "ci.gtf.gz")
        conversion_name = os.path.join(self.outdir, "ci_ids.csv")

        idx = index.Index("ciona_intestinalis", index_folder_name=self.outdir)

        idx._download_ensembl_files(
            ensemble_release=None,
            fasta_name=fasta_name,
            gtf_name=gtf_name,
            conversion_name=conversion_name,
        )
        truncated_gtf = os.path.join(self.outdir, "test.gtf")
        idx._subset_genes(conversion_name,
                          gtf_name,
                          truncated_gtf,
                          valid_biotypes=None)

        # expect the same file as the original file
        self.assertTrue(os.path.isfile(truncated_gtf))

        # the current implementation of GTF Reader doesn't allow this:
        # for gr1, gr2 in zip(gtf.Reader(gtf_name), gtf.Reader(truncated_gtf)):
        # so the original records are materialised first and compared by
        # position while streaming the truncated file.
        original_records = [
            gtf.Record(fields) for fields in gtf.Reader(gtf_name)
        ]

        truncated_count = 0
        for position, fields in enumerate(gtf.Reader(truncated_gtf)):
            self.assertLess(
                position,
                len(original_records),
                "truncated annotation has more records than the original",
            )
            self.assertEqual(original_records[position], gtf.Record(fields))
            truncated_count = position + 1

        # Without this check an empty or shortened output file would pass,
        # because the loop above only visits records present in the output.
        self.assertEqual(
            truncated_count,
            len(original_records),
            "truncated annotation has fewer records than the original",
        )
Exemple #2
0
    def test_subset_genes_produces_a_reduced_annotation_file_when_passed_fields(self):
        """Check that ``_subset_genes`` writes a smaller, biotype-filtered GTF.

        Downloads the ENSEMBL files for ``ciona_intestinalis`` into
        ``self.outdir`` (by making it the working directory), subsets the
        annotation, and verifies that the subset is non-empty, smaller than
        the complete annotation, and contains only genes whose
        ``gene_biotype`` is ``protein_coding`` or ``lincRNA``.
        """
        organism = 'ciona_intestinalis'
        idx = index.Index(organism, ['entrezgene'])

        # The files are looked up relative to the working directory below, so
        # the test has to chdir; restore the previous directory afterwards so
        # the change does not leak into tests that run later.
        self.addCleanup(os.chdir, os.getcwd())
        os.chdir(self.outdir)

        idx._download_ensembl_files()
        self.assertTrue(os.path.isfile('%s.fa.gz' % organism), 'fasta file not found')
        self.assertTrue(os.path.isfile('%s.gtf.gz' % organism), 'gtf file not found')
        self.assertTrue(os.path.isfile('%s_ids.csv' % organism), 'id file not found')

        idx._subset_genes()
        self.assertTrue(os.path.isfile('%s_multiconsortia.gtf' % organism))
        gr_subset = gtf.Reader('%s_multiconsortia.gtf' % organism)
        gr_complete = gtf.Reader('%s.gtf.gz' % organism)
        self.assertLess(
            len(gr_subset), len(gr_complete),
            'Subset annotation was not smaller than the complete annotation')

        # make sure only valid biotypes are returned
        valid_biotypes = {b'protein_coding', b'lincRNA'}

        # Sanity check: the complete annotation must contain at least one
        # gene that the filter is supposed to remove.
        complete_invalid = any(
            r.attribute(b'gene_biotype') not in valid_biotypes
            for r in gr_complete.iter_genes())
        self.assertTrue(complete_invalid)

        subset_invalid = any(
            r.attribute(b'gene_biotype') not in valid_biotypes
            for r in gr_subset.iter_genes())
        self.assertFalse(subset_invalid)
        self.assertGreater(len(gr_subset), 0)
Exemple #3
0
    def test_subset_genes_produces_a_reduced_annotation_file_when_passed_fields(
            self):
        """Check that ``_subset_genes`` writes a smaller, biotype-filtered GTF.

        Downloads the ENSEMBL files for ``ciona_intestinalis`` into
        ``self.outdir``, subsets the annotation to ``protein_coding`` and
        ``lincRNA`` records, and verifies that the result is non-empty,
        smaller than the complete annotation, and free of other biotypes.
        """
        organism = "ciona_intestinalis"
        idx = index.Index(organism, ["external_gene_name"],
                          index_folder_name=self.outdir)
        idx._download_ensembl_files(ensemble_release=None)

        # every downloaded artefact must be present in the output directory
        expected_downloads = (
            ("%s.fa.gz" % organism, "fasta file not found"),
            ("%s.gtf.gz" % organism, "gtf file not found"),
            ("%s_ids.csv" % organism, "id file not found"),
        )
        for basename, failure_message in expected_downloads:
            self.assertTrue(
                os.path.isfile(os.path.join(self.outdir, basename)),
                failure_message,
            )

        valid_biotypes = {"protein_coding", "lincRNA"}
        idx._subset_genes(valid_biotypes=valid_biotypes)

        subset_path = os.path.join(self.outdir, organism, "annotations.gtf")
        complete_path = os.path.join(self.outdir, "%s.gtf.gz" % organism)

        self.assertTrue(os.path.isfile(subset_path))
        gr_subset = gtf.Reader(subset_path)
        gr_complete = gtf.Reader(complete_path)
        self.assertLess(
            len(gr_subset),
            len(gr_complete),
            "Subset annotation was not smaller than the complete annotation",
        )

        def contains_invalid_biotype(reader):
            # True as soon as one record has a biotype outside the valid set
            return any(
                gtf.Record(fields).attribute("gene_biotype") not in
                valid_biotypes for fields in reader)

        # make sure only valid biotypes are returned: the complete annotation
        # must hold something to filter out, the subset must not
        self.assertTrue(contains_invalid_biotype(gr_complete))
        self.assertFalse(contains_invalid_biotype(gr_subset))
        self.assertGreater(len(gr_subset), 0)
Exemple #4
0
 def test_download_gtf_file_gets_a_file_readable_by_seqc_gtf_reader(self):
     """Download a GTF over FTP and check the GTF reader yields genes from it.

     The downloaded file is removed afterwards, whether or not the
     assertions succeed.
     """
     idx = index.Index('ciona_intestinalis', ['entrezgene'])
     # os.path.join keeps the file inside outdir even when outdir has no
     # trailing separator (plain concatenation would not).
     filename = os.path.join(self.outdir, 'ci.gtf.gz')
     try:
         with ftplib.FTP(host='ftp.ensembl.org') as ftp:
             ftp.login()
             idx._download_gtf_file(ftp, filename)
         rd = gtf.Reader(filename)
         rc = next(rd.iter_genes())
         self.assertIsInstance(rc, gtf.Gene)
     finally:
         # The download may have failed before creating the file; only
         # remove what exists so the original error is not masked.
         if os.path.isfile(filename):
             os.remove(filename)
Exemple #5
0
    def test_download_gtf_file_gets_a_file_readable_by_seqc_gtf_reader(self):
        """Download a GTF over FTP and check the first transcript it yields.

        Fetches the ``ciona_intestinalis`` annotation for ENSEMBL release 99
        and verifies the chromosome, strand, gene id and exon count of the
        first item produced by ``gtf.Reader.iter_transcripts``.
        """

        idx = index.Index("ciona_intestinalis", ["entrezgene"])

        # os.path.join keeps the file inside outdir even when outdir has no
        # trailing separator (plain concatenation would not).
        filename = os.path.join(self.outdir, "ci.gtf.gz")

        with ftplib.FTP(host="ftp.ensembl.org") as ftp:
            ftp.login()
            idx._download_gtf_file(ftp, filename, ensemble_release=99)

        rd = gtf.Reader(filename)
        (transcript_chromosome, transcript_strand,
         transcript_gene_id), exons = next(rd.iter_transcripts())

        # Expected first item:
        # (('1', '+', 17842), [['1', 'ensembl', 'exon', '1636', '1902', '.', '+', '.', 'gene_id "ENSCING00000017842"; gene_version "1"; transcript_id "ENSCINT00000030147"; transcript_version "1"; exon_number "1"; gene_name "RNaseP_nuc"; gene_source "ensembl"; gene_biotype "misc_RNA"; transcript_name "RNaseP_nuc-201"; transcript_source "ensembl"; transcript_biotype "misc_RNA"; exon_id "ENSCINE00000207263"; exon_version "1";\n']])
        self.assertEqual(transcript_chromosome, "1")
        self.assertEqual(transcript_strand, "+")
        self.assertEqual(transcript_gene_id, 17842)
        self.assertEqual(len(exons), 1)
Exemple #6
0
    def test_iter_transcripts(self):
        """Check the first item yielded by ``gtf.Reader.iter_transcripts``.

        The item is expected to be a ``((chromosome, strand, gene_id), exons)``
        pair describing the first transcript of the first gene in
        ``self.annotation``.
        """
        reader = gtf.Reader(self.annotation)
        (chromosome, strand, gene_id), exons = next(reader.iter_transcripts())

        # The first transcript of the first gene (WASH5P, gene_id
        # "ENSG00000282458.1", transcript_id "ENST00000632506.1") lies on the
        # minus strand of chr19 and is annotated by HAVANA as:
        #
        #   feature     start  end    exon_number  exon_id
        #   gene        60951  71626
        #   transcript  60951  70976
        #   exon        70928  70976  1            ENSE00003781173.1
        #   exon        66346  66499  2            ENSE00003783498.1
        #   exon        60951  61894  3            ENSE00003783010.1
        #
        # iter_transcripts() should give us its 3 exons in inverse order.
        self.assertEqual(chromosome, "chr19")
        self.assertEqual(strand, "-")
        self.assertEqual(gene_id, 282458)
        self.assertEqual(len(exons), 3)

        # index 8 (the attribute column of a GTF line) carries the exon ID;
        # listed here from exon number 3 down to exon number 1
        exon_ids_in_yield_order = (
            "ENSE00003783010.1",
            "ENSE00003783498.1",
            "ENSE00003781173.1",
        )
        for exon_fields, exon_id in zip(exons, exon_ids_in_yield_order):
            self.assertIn(exon_id, exon_fields[8])
Exemple #7
0
 def get_length_of_gtf(self):
     """Print how many transcripts ``self.annotation`` contains.

     Counts the items yielded by ``iter_transcripts()`` rather than calling
     ``len()`` on the reader.
     """
     transcript_total = 0
     for _ in gtf.Reader(self.annotation).iter_transcripts():
         transcript_total += 1
     print(transcript_total)
Exemple #8
0
 def test_num_of_transcripts(self):
     """Check the number of transcripts read from ``self.annotation``."""
     reader = gtf.Reader(self.annotation)
     transcript_total = 0
     for _ in reader.iter_transcripts():
         transcript_total += 1
     # expected value obtained independently with:
     # awk -F'\t' '$3=="transcript" { print $0 }' annotations.gtf | wc -l
     self.assertEqual(transcript_total, 12747)
Exemple #9
0
    def _subset_genes(
            self,
            conversion_file: str = None,
            gtf_file: str = None,
            truncated_annotation: str = None,
            valid_biotypes=("protein_coding", "lincRNA"),
    ):
        """
        Remove any annotation from the annotation_file that is not also defined by at
        least one additional identifier present in conversion file.

        The effect of these fields is to limit the ENSEMBL genes to a subset of genes
        which are also defined by other consortia. The rationale for requiring multiple
        definitions is that ENSEMBL has very relaxed standards, with many of its genes
        being defined based on predicted locations without any biological evidence, or,
        more importantly, any associated biological information. These such genes are
        often uninformative as a result, and are better excluded from the index.

        valid_biotypes removes genes that are of biotypes that single-cell sequencing
        is unlikely to detect. For example, miRNA are rarely poly-adenylated, and are
        of a size that they are often removed with primers. In our experience, the only
        biotypes that are worth considering are protein coding genes and lincRNA, the
        defaults for this function.

        If neither additional id types nor valid_biotypes are given, the gzipped
        annotation is simply decompressed to truncated_annotation. If only
        valid_biotypes is empty/None, no biotype filtering is applied and records
        are filtered by ensembl id alone.

        :param conversion_file: file location of the conversion file; defaults to
          <index_folder_name>/<organism>_ids.csv
        :param gtf_file: file location of the annotation file; defaults to
          <index_folder_name>/<organism>.gtf.gz
        :param truncated_annotation: name for the generated output file; defaults to
          <index_folder_name>/<organism>/annotations.gtf
        :param list(str) valid_biotypes: only accept genes of this biotype.
        :raises subprocess.CalledProcessError: if decompressing the annotation with
          gunzip exits with a non-zero status
        :raises Exception: if the conversion file has more than one identifier column
        """

        if gtf_file is None:
            gtf_file = os.path.join(self.index_folder_name,
                                    "{}.gtf.gz".format(self.organism))
        if conversion_file is None:
            conversion_file = os.path.join(self.index_folder_name,
                                           "{}_ids.csv".format(self.organism))
        if truncated_annotation is None:
            truncated_annotation = os.path.join(self.index_folder_name,
                                                self.organism,
                                                "annotations.gtf")

        if not (self.additional_id_types
                or valid_biotypes):  # nothing to be done
            # no need to truncate the annotation file
            # let's just make a copy of the original file so that it can be added to the final output directory
            #
            # The command is passed as an argument list with stdout redirected
            # to the output file, so paths containing spaces or shell
            # metacharacters are handled safely. check_call raises on a
            # non-zero exit status, so no return-code check is needed.
            with open(truncated_annotation, "wb") as uncompressed:
                check_call(["gunzip", "-c", gtf_file], stdout=uncompressed)
            return

        # change to set for efficiency; None means "do not filter on biotype"
        # (reached when only additional id types were requested)
        biotype_filter = set(valid_biotypes) if valid_biotypes else None

        # extract valid ensembl ids from the conversion file
        c = pd.read_csv(conversion_file, index_col=[0])

        if c.shape[1] == 1:
            # index == ensembl_gene_id & col 1 == hgnc_symbol
            # keep ids whose additional identifier is not null
            valid_ensembl_ids = set(c[np.any(~c.isnull().values,
                                             axis=1)].index)
        elif c.shape[1] == 0:
            # index == ensembl_gene_id & no columns
            # set to none to take all IDs
            valid_ensembl_ids = None
        else:
            raise Exception("Not implemented/supported shape={}".format(
                c.shape))

        # remove any invalid ids from the annotation file
        gr = gtf.Reader(gtf_file)
        with open(truncated_annotation, "wt") as f:
            for line_fields in gr:
                record = gtf.Record(line_fields)
                # include only biotypes of interest
                if (biotype_filter is not None and
                        record.attribute("gene_biotype") not in biotype_filter):
                    continue
                if (valid_ensembl_ids is not None and
                        record.attribute("gene_id") not in valid_ensembl_ids):
                    continue
                # no newline is appended here; presumably the last field
                # still carries the line terminator - TODO confirm
                f.write("\t".join(line_fields))
Exemple #10
0
    def _subset_genes(self,
                      conversion_file: str = None,
                      gtf_file: str = None,
                      truncated_annotation: str = None,
                      valid_biotypes=(b'protein_coding', b'lincRNA')):
        """
        Remove any annotation from the annotation_file that is not also defined by at
        least one additional identifier present in conversion file.

        The effect of these fields is to limit the ENSEMBL genes to a subset of genes
        which are also defined by other consortia. The rationale for requiring multiple
        definitions is that ENSEMBL has very relaxed standards, with many of its genes
        being defined based on predicted locations without any biological evidence, or,
        more importantly, any associated biological information. These such genes are
        often uninformative as a result, and are better excluded from the index.

        valid_biotypes removes genes that are of biotypes that single-cell sequencing
        is unlikely to detect. For example, miRNA are rarely poly-adenylated, and are
        of a size that they are often removed with primers. In our experience, the only
        biotypes that are worth considering are protein coding genes and lincRNA, the
        defaults for this function.

        If neither additional id types nor valid_biotypes are given, nothing is
        written. If only valid_biotypes is empty/None, no biotype filtering is
        applied and records are filtered by ensembl id alone.

        :param conversion_file: file location of the conversion file; defaults to
          <index_folder_name>/<organism>_ids.csv
        :param gtf_file: file location of the annotation file; defaults to
          <index_folder_name>/<organism>.gtf.gz
        :param truncated_annotation: name for the generated output file; defaults to
          <index_folder_name>/<organism>_multiconsortia.gtf
        :param list(bytes) valid_biotypes: only accept genes of this biotype. May be
          given as str or bytes objects, but not a mixture of both.
        :raises TypeError: if valid_biotypes mixes str and bytes entries
        """
        if not (self.additional_id_types
                or valid_biotypes):  # nothing to be done
            return

        # change to set for efficiency; biotypes are compared as bytes.
        # None means "do not filter on biotype" (reached when only additional
        # id types were requested and valid_biotypes is empty/None).
        if not valid_biotypes:
            biotype_filter = None
        elif all(isinstance(t, str) for t in valid_biotypes):
            biotype_filter = {t.encode() for t in valid_biotypes}
        elif all(isinstance(t, bytes) for t in valid_biotypes):
            biotype_filter = set(valid_biotypes)
        else:
            raise TypeError(
                'mixed-type biotypes detected. Please pass valid_biotypes '
                'as strings or bytes objects (but not both).')

        if gtf_file is None:
            gtf_file = '%s/%s.gtf.gz' % (self.index_folder_name, self.organism)
        if conversion_file is None:
            conversion_file = '%s/%s_ids.csv' % (self.index_folder_name,
                                                 self.organism)
        if truncated_annotation is None:
            truncated_annotation = '%s/%s_multiconsortia.gtf' % (
                self.index_folder_name, self.organism)

        # extract valid ensembl ids from the conversion file: keep every id
        # that has at least one non-null additional identifier
        # NOTE(review): a conversion file with no identifier columns yields an
        # empty set here, which filters out every record - confirm intended.
        c = pd.read_csv(conversion_file, index_col=[0])
        valid_ensembl_ids = set(c[np.any(~c.isnull().values, axis=1)].index)

        # remove any invalid ids from the annotation file
        gr = gtf.Reader(gtf_file)
        with open(truncated_annotation, 'wb') as f:
            for line_fields in gr:
                record = gtf.Record(line_fields)
                if record.attribute(b'gene_id').decode() not in valid_ensembl_ids:
                    continue
                if (biotype_filter is not None
                        and record.attribute(b'gene_biotype')
                        not in biotype_filter):
                    continue
                f.write(bytes(record))