def test_build_ensembl_transcript_index(self):
        """Build the transcript index from the trimmed yeast GTF/FASTA and read it back.

        The index is written to ``out/test_ensembl_gtf.db`` with the ``file``
        protocol, reopened through Shove, and the ``YDR529C`` entry is checked
        for a non-empty sequence, at least one CDS entry and at least one exon.
        """
        # The trimmed test inputs were derived with the following commands:
        # cat ~/oncotator_pycharm/oncotator/test/testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf | cut -f 9 | cut -f 5 --delimiter=" " | sort | uniq | sed -r "s/;//g" | sed -r "s/\"//g"
        #  snR84, tK(UUU)K, YAL067C, YAL067W-A, YAL068C, YAL068W-A, YAL069W, YBR278W, YBR279W, YBR280C, YBR281C, YDR528W, YDR529C, YKR074W,
        #
        # grep -Pzo  ">(snR84|tK\(UUU\)K|YAL067C|YAL067W-A|YAL068C|YAL068W-A|YAL069W|YBR278W|YBR279W|YBR280C|YBR281C|YDR528W|YDR529C|YKR074W)([A-Za-z_0-9 \:\-\n]+)" Saccharomyces_cerevisiae.EF4.71.cdna.all.fa >Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa
        #
        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

        output_filename = "out/test_ensembl_gtf.db"
        protocol = "file"
        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], output_filename, protocol=protocol)
        self.assertTrue(os.path.exists(output_filename))

        try:
            shove = Shove(protocol + "://" + output_filename, "memory://")
            self.assertTrue(len(shove.keys()) > 0)
            self.assertIn("YDR529C", shove.keys())
            t = shove["YDR529C"]
            self.assertIsNotNone(t.get_seq())
            # Compare by value: `is not ""` checks object identity, which does
            # not reliably tell an empty sequence from a non-empty one.
            self.assertNotEqual(t.get_seq(), "")
            self.assertTrue(len(t.get_cds()) > 0)
            self.assertTrue(len(t.get_exons()) > 0)
        finally:
            # Clean up even when an assertion fails, so a stale index cannot
            # satisfy the existence check on the next run.
            MutUtils.removeDir(output_filename)
Exemple #2
0
    def test_gencode_small(self):
        """Index the small MAPK1 GENCODE v19 GTF/FASTA pair and spot-check the stored transcripts."""
        gtf_path = "testdata/gencode/MAPK1.gencode.v19.annotation.gtf"
        fasta_path = "testdata/gencode/MAPK1.gencode.v19.pc_transcripts.fa"
        base_output_filename = "out/test_small_gencode"
        # Drop any indices left behind by an earlier run.
        for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
            shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

        GenomeBuildFactory().construct_ensembl_indices([gtf_path], [fasta_path], base_output_filename)

        by_gene = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", "memory://", optimize=False)
        mapk1_transcripts = by_gene["MAPK1"]
        self.assertTrue(len(mapk1_transcripts) == 4)

        by_bin = Shove("file://" + base_output_filename + ".transcript_by_gp_bin.idx", "memory://", optimize=False)
        bin_transcripts = by_bin["22_753"]
        self.assertTrue(bin_transcripts[0].get_strand() == "-")
        self.assertTrue(len(bin_transcripts) == 1)

        # Only the ENST00000215832.6 entry has its sequence checked; if it is
        # absent from the bin, this loop performs no assertion.
        for tx in bin_transcripts:
            if tx.get_transcript_id() == "ENST00000215832.6":
                self.assertTrue(tx.get_seq().startswith("AGGCAATCGGTCCGAG"))
Exemple #3
0
    def test_build_ensembl_transcript_index(self):
        """Build the transcript index from the trimmed yeast GTF/FASTA and read it back.

        The index is written to ``out/test_ensembl_gtf.db`` with the ``file``
        protocol, reopened through Shove, and the ``YDR529C`` entry is checked
        for a non-empty sequence, at least one CDS entry and at least one exon.
        """
        # The trimmed test inputs were derived with the following commands:
        # cat ~/oncotator_pycharm/oncotator/test/testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf | cut -f 9 | cut -f 5 --delimiter=" " | sort | uniq | sed -r "s/;//g" | sed -r "s/\"//g"
        #  snR84, tK(UUU)K, YAL067C, YAL067W-A, YAL068C, YAL068W-A, YAL069W, YBR278W, YBR279W, YBR280C, YBR281C, YDR528W, YDR529C, YKR074W,
        #
        # grep -Pzo  ">(snR84|tK\(UUU\)K|YAL067C|YAL067W-A|YAL068C|YAL068W-A|YAL069W|YBR278W|YBR279W|YBR280C|YBR281C|YDR528W|YDR529C|YKR074W)([A-Za-z_0-9 \:\-\n]+)" Saccharomyces_cerevisiae.EF4.71.cdna.all.fa >Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa
        #
        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

        output_filename = "out/test_ensembl_gtf.db"
        protocol = "file"
        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], output_filename, protocol=protocol)
        self.assertTrue(os.path.exists(output_filename))

        try:
            shove = Shove(protocol + "://" + output_filename, "memory://")
            self.assertTrue(len(shove.keys()) > 0)
            self.assertIn("YDR529C", shove.keys())
            t = shove["YDR529C"]
            self.assertIsNotNone(t.get_seq())
            # Compare by value: `is not ""` checks object identity, which does
            # not reliably tell an empty sequence from a non-empty one.
            self.assertNotEqual(t.get_seq(), "")
            self.assertTrue(len(t.get_cds()) > 0)
            self.assertTrue(len(t.get_exons()) > 0)
        finally:
            # Clean up even when an assertion fails, so a stale index cannot
            # satisfy the existence check on the next run.
            MutUtils.removeDir(output_filename)
Exemple #4
0
 def _create_test_gencode_ds(base_output_filename, protein_id_mapping_file,
                             gencode_version):
     """Build indices for a fixed set of test genes and return a GENCODE datasource over them.

     :param base_output_filename: path prefix for the three generated index directories
     :param protein_id_mapping_file: passed through to ``construct_ensembl_indices``
     :param gencode_version: version used in the test-data file names and in the
         datasource version string ("v" + version)
     :return: an EnsemblTranscriptDatasource created with the "basic" transcript filter
     """
     genes = [
         "MAPK1", "MUC16", "PIK3CA", "YPEL1", "KRTAP4-7", "MAT2A",
         "DDX11L10"
     ]
     version_text = str(gencode_version)
     path_prefixes = ["testdata/gencode/" + gene + ".gencode.v" + version_text
                      for gene in genes]
     gtf_list = [prefix + ".annotation.gtf" for prefix in path_prefixes]
     fasta_list = [prefix + ".pc_transcripts.fa" for prefix in path_prefixes]

     # Remove indices from a previous run before rebuilding.
     for suffix in (".transcript.idx", ".transcript_by_gene.idx",
                    ".transcript_by_gp_bin.idx"):
         shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

     GenomeBuildFactory().construct_ensembl_indices(
         gtf_list,
         fasta_list,
         base_output_filename,
         protein_id_mapping_file=protein_id_mapping_file)
     return EnsemblTranscriptDatasource(base_output_filename,
                                        title="GENCODE",
                                        version="v" + version_text,
                                        tx_filter="basic")
    def test_gencode_small(self):
        """Index the small MAPK1 GENCODE v18 GTF/FASTA pair and spot-check the stored transcripts."""
        gtf_path = "testdata/gencode/MAPK1.gencode.v18.annotation.gtf"
        fasta_path = "testdata/gencode/MAPK1.gencode.v18.pc_transcripts.fa"
        base_output_filename = "out/test_small_gencode"
        # Drop any indices left behind by an earlier run.
        for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
            shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

        GenomeBuildFactory().construct_ensembl_indices([gtf_path], [fasta_path], base_output_filename)

        by_gene = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", "memory://", optimize=False)
        mapk1_transcripts = by_gene["MAPK1"]
        self.assertTrue(len(mapk1_transcripts) == 4)

        by_bin = Shove("file://" + base_output_filename + ".transcript_by_gp_bin.idx", "memory://", optimize=False)
        bin_transcripts = by_bin["22_753"]
        self.assertTrue(bin_transcripts[0].get_strand() == "-")
        self.assertTrue(len(bin_transcripts) == 1)

        # Only the ENST00000215832.6 entry has its sequence checked; if it is
        # absent from the bin, this loop performs no assertion.
        for tx in bin_transcripts:
            if tx.get_transcript_id() == "ENST00000215832.6":
                self.assertTrue(tx.get_seq().startswith("AGGCAATCGGTCCGAG"))
 def _create_ensembl_ds_from_saccer(self):
     """Rebuild the indices for the trimmed yeast test data and return an ensembl datasource (version "71")."""
     base_output_filename = "out/test_saccer_ds"
     # Start from a clean slate: remove all three index directories if present.
     for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
         shutil.rmtree(base_output_filename + suffix, ignore_errors=True)
     GenomeBuildFactory().construct_ensembl_indices(
         ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"],
         ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"],
         base_output_filename)
     return EnsemblTranscriptDatasource(base_output_filename, title="ensembl", version="71")
 def _create_ensembl_ds_from_saccer(self):
     """Rebuild the indices for the trimmed yeast test data and return an ensembl datasource (version "71")."""
     base_output_filename = "out/test_saccer_ds"
     # Start from a clean slate: remove all three index directories if present.
     for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
         shutil.rmtree(base_output_filename + suffix, ignore_errors=True)
     GenomeBuildFactory().construct_ensembl_indices(
         ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"],
         ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"],
         base_output_filename)
     return EnsemblTranscriptDatasource(base_output_filename, title="ensembl", version="71")
 def _create_ensembl_ds_from_testdata(self, gene):
     """Rebuild the indices for one gene's GENCODE v18 test data and return a datasource over them.

     :param gene: gene name used to select the test GTF and FASTA files
     :return: an EnsemblTranscriptDatasource titled "GENCODE", version "v18"
     """
     file_prefix = "testdata/gencode/" + gene
     base_output_filename = "out/test_variant_classification"
     # Remove indices from a previous run before rebuilding.
     for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
         shutil.rmtree(base_output_filename + suffix, ignore_errors=True)
     GenomeBuildFactory().construct_ensembl_indices(
         [file_prefix + ".gencode.v18.annotation.gtf"],
         [file_prefix + ".gencode.v18.pc_transcripts.fa"],
         base_output_filename)
     return EnsemblTranscriptDatasource(base_output_filename, title="GENCODE", version="v18")
Exemple #9
0
    def test_construct_full_indices(self):
        """Construct all three ensembl indices with one call and check that each one exists."""
        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"
        base_output_filename = "out/test_full_indices_ensembl"
        index_suffixes = (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx")

        # Remove every index that is asserted on below (including the gp_bin
        # index), so a leftover from an earlier run cannot satisfy the checks.
        for suffix in index_suffixes:
            shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.construct_ensembl_indices([ensembl_input_gtf], [ensembl_input_fasta], base_output_filename)

        for suffix in index_suffixes:
            self.assertTrue(os.path.exists(base_output_filename + suffix),
                            "Missing index: " + base_output_filename + suffix)
    def test_construct_full_indices(self):
        """Construct all three ensembl indices with one call and check that each one exists."""
        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"
        base_output_filename = "out/test_full_indices_ensembl"
        index_suffixes = (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx")

        # Remove every index that is asserted on below (including the gp_bin
        # index), so a leftover from an earlier run cannot satisfy the checks.
        for suffix in index_suffixes:
            shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.construct_ensembl_indices([ensembl_input_gtf], [ensembl_input_fasta], base_output_filename)

        for suffix in index_suffixes:
            self.assertTrue(os.path.exists(base_output_filename + suffix),
                            "Missing index: " + base_output_filename + suffix)
Exemple #11
0
 def _create_test_gencode_ds(base_output_filename, protein_id_mapping_file="testdata/gencode/ensembl_id_mappingsGRCh37.p13.txt"):
     """Build indices for a fixed set of GENCODE v18 test genes and return a datasource over them.

     :param base_output_filename: path prefix for the three generated index directories
     :param protein_id_mapping_file: passed through to ``construct_ensembl_indices``
     :return: an EnsemblTranscriptDatasource created with the "basic" transcript filter
     """
     genes = ["MAPK1", "MUC16", "PIK3CA", "YPEL1", "KRTAP4-7", "MAT2A"]
     gtf_list = ["testdata/gencode/" + gene + ".gencode.v18.annotation.gtf" for gene in genes]
     fasta_list = ["testdata/gencode/" + gene + ".gencode.v18.pc_transcripts.fa" for gene in genes]

     # Remove indices from a previous run before rebuilding.
     for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
         shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

     GenomeBuildFactory().construct_ensembl_indices(gtf_list, fasta_list, base_output_filename, protein_id_mapping_file=protein_id_mapping_file)
     return EnsemblTranscriptDatasource(base_output_filename, title="GENCODE", version="v18", tx_filter="basic")
    def test_multiple_gtf_initialization(self):
        """Build one set of indices from two GTF/FASTA pairs (CP and MAPK1) and check both genes."""
        gencode_input_gtfs = ["testdata/gencode/CP.gencode.annotation.gtf", "testdata/gencode/MAPK1.gencode.v18.annotation.gtf"]
        gencode_input_fastas = ["testdata/gencode/CP.gencode.pc_transcripts.fa", "testdata/gencode/MAPK1.gencode.v18.pc_transcripts.fa"]
        base_output_filename = "out/test_multi_gencode"
        # Drop any indices left behind by an earlier run.
        for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
            shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

        GenomeBuildFactory().construct_ensembl_indices(gencode_input_gtfs, gencode_input_fastas, base_output_filename)
        by_gene = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", "memory://", optimize=False)

        cp_transcripts = by_gene["CP"]
        self.assertTrue(len(cp_transcripts) == 15)

        mapk1_transcripts = by_gene["MAPK1"]
        self.assertTrue(len(mapk1_transcripts) == 4)
        for tx in mapk1_transcripts:
            tx_id = tx.get_transcript_id()
            # ENST00000491588.1 is exempt from the sequence-length requirement.
            has_seq = tx_id == "ENST00000491588.1" or len(tx.get_seq()) > 100
            self.assertTrue(has_seq, "No seq data for " + tx_id)
    def test_convert_genomic_space_to_exon_space(self, loc, gt_d):
        """Test genomic --> exon transform on real data.

        :param loc: pair (start, end) of genomic positions to convert
        :param gt_d: ground-truth exon-space start position expected for ``loc``
        """
        gencode_input_gtf = "testdata/gencode/MAPK1.gencode.v18.annotation.gtf"
        gencode_input_fasta = "testdata/gencode/MAPK1.gencode.v18.pc_transcripts.fa"
        base_output_filename = "out/test_variant_classification"
        # Drop any indices left behind by an earlier run.
        for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
            shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.construct_ensembl_indices([gencode_input_gtf], [gencode_input_fasta], base_output_filename)
        ensembl_ds = EnsemblTranscriptDatasource(base_output_filename, version="TEST")
        tx = ensembl_ds.get_overlapping_transcripts("22", "22108790", "22108790")

        start, end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(loc[0], loc[1], tx[0])
        loc_length = int(loc[1]) - int(loc[0])
        # The converted interval must keep the length of the genomic interval.
        self.assertEqual(end - start, loc_length, str(end) + " - " + str(start) + " was not correct length: " + str(loc_length))
        # Report the ground-truth value (gt_d) in the message, not the converted end.
        self.assertEqual(start, gt_d, "start position (" + str(start) + ") did not match gt (" + str(gt_d) + ")" + "   exons: " + str(tx[0].get_exons()))
Exemple #14
0
    def test_multiple_gtf_initialization(self):
        """Build one set of indices from two v19 GTF/FASTA pairs (CP and MAPK1) and check both genes."""
        gencode_input_gtfs = ["testdata/gencode/CP.gencode.v19.annotation.gtf", "testdata/gencode/MAPK1.gencode.v19.annotation.gtf"]
        gencode_input_fastas = ["testdata/gencode/CP.gencode.v19.pc_transcripts.fa", "testdata/gencode/MAPK1.gencode.v19.pc_transcripts.fa"]
        base_output_filename = "out/test_multi_gencode"
        # Drop any indices left behind by an earlier run.
        for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
            shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

        GenomeBuildFactory().construct_ensembl_indices(gencode_input_gtfs, gencode_input_fastas, base_output_filename)
        by_gene = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", "memory://", optimize=False)

        cp_transcripts = by_gene["CP"]
        self.assertTrue(len(cp_transcripts) == 15)

        mapk1_transcripts = by_gene["MAPK1"]
        self.assertTrue(len(mapk1_transcripts) == 4)
        for tx in mapk1_transcripts:
            tx_id = tx.get_transcript_id()
            # ENST00000491588.1 is exempt from the sequence-length requirement.
            has_seq = tx_id == "ENST00000491588.1" or len(tx.get_seq()) > 100
            self.assertTrue(has_seq, "No seq data for " + tx_id)
    def test_build_ensembl_transcripts_by_gene_index(self):
        """Build the gene -> transcripts index from the yeast test data and look up SEO1."""
        protocol = "file"
        transcript_index_filename = "out/test_ensembl_gtf_for_gene.db"
        output_filename = "out/test_ensembl_gtf_for_gene.db.gene.idx"
        # Only the gene index is cleared here; the transcript index is not removed first.
        shutil.rmtree(output_filename, ignore_errors=True)

        factory = GenomeBuildFactory()
        factory.build_ensembl_transcript_index(
            ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"],
            ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"],
            transcript_index_filename,
            protocol=protocol)
        factory.build_ensembl_transcripts_by_gene_index(transcript_index_filename, output_filename)

        # Reopen the freshly written index and check the single SEO1 transcript.
        gene_index = Shove(protocol + "://" + output_filename, optimize=False)
        seo1_transcripts = gene_index['SEO1']
        self.assertTrue(len(seo1_transcripts) == 1)

        first_tx = seo1_transcripts[0]
        self.assertTrue(first_tx.get_transcript_id() == "YAL067C")
    def test_gencode_cp(self):
        """Index the CP gene, which previously caused problems, and confirm the troublesome transcript is present."""
        gtf_path = "testdata/gencode/CP.gencode.annotation.gtf"
        fasta_path = "testdata/gencode/CP.gencode.pc_transcripts.fa"
        base_output_filename = "out/test_cp_gencode"
        # Drop any indices left behind by an earlier run.
        for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
            shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

        GenomeBuildFactory().construct_ensembl_indices([gtf_path], [fasta_path], base_output_filename)
        by_gene = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", "memory://", optimize=False)
        cp_transcripts = by_gene["CP"]

        self.assertTrue(len(cp_transcripts) == 15)
        troubled_transcript = "ENST00000474204.1"
        # any() stops at the first match, like a loop with an early break.
        is_troubled_transcript_seen = any(
            tx.get_transcript_id() == troubled_transcript for tx in cp_transcripts)
        self.assertTrue(is_troubled_transcript_seen)
Exemple #17
0
    def test_gencode_cp(self):
        """Index the CP gene (v19 data), which previously caused problems, and confirm the troublesome transcript is present."""
        gtf_path = "testdata/gencode/CP.gencode.v19.annotation.gtf"
        fasta_path = "testdata/gencode/CP.gencode.v19.pc_transcripts.fa"
        base_output_filename = "out/test_cp_gencode"
        # Drop any indices left behind by an earlier run.
        for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
            shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

        GenomeBuildFactory().construct_ensembl_indices([gtf_path], [fasta_path], base_output_filename)
        by_gene = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", "memory://", optimize=False)
        cp_transcripts = by_gene["CP"]

        self.assertTrue(len(cp_transcripts) == 15)
        troubled_transcript = "ENST00000474204.1"
        # any() stops at the first match, like a loop with an early break.
        is_troubled_transcript_seen = any(
            tx.get_transcript_id() == troubled_transcript for tx in cp_transcripts)
        self.assertTrue(is_troubled_transcript_seen)
    def test_retrieving_sequence(self):
        """Look up yeast transcripts by gene and by genomic bin, then check sequence and strand."""

        def pick(candidates, wanted_id):
            # Linear scan: yields the entry whose id matches, otherwise the
            # last entry. An empty list raises IndexError.
            chosen = candidates[0]
            for chosen in candidates:
                if chosen._transcript_id == wanted_id:
                    break
            return chosen

        base_output_filename = "out/test_retrieving_full_indices_ensembl"
        # Drop any indices left behind by an earlier run.
        for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
            shutil.rmtree(base_output_filename + suffix, ignore_errors=True)
        GenomeBuildFactory().construct_ensembl_indices(
            ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"],
            ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"],
            base_output_filename)

        by_gene = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", optimize=False)

        seo1_tx = pick(by_gene['SEO1'], "YAL067C")
        self.assertTrue(seo1_tx.get_seq().startswith('ATGTATTCAATTGTTAAAGAGATTATTGTAGATCCTTACAAAAGACTAAAATGGGGTTTT'))

        pau8_tx = pick(by_gene['PAU8'], "YAL068C")
        self.assertTrue(pau8_tx.get_strand() == "-")

        by_bin = Shove("file://" + base_output_filename + ".transcript_by_gp_bin.idx", "memory://")
        bin_transcripts = by_bin["I_585"]
        self.assertTrue(len(bin_transcripts) == 5, "There should be 5 transcripts.")
        bin_tx = pick(bin_transcripts, "YAL069W")
        self.assertTrue(bin_tx.get_strand() == "+")
Exemple #19
0
    def test_retrieving_sequence(self):
        """Look up yeast transcripts by gene and by genomic bin, then check sequence and strand."""

        def pick(candidates, wanted_id):
            # Linear scan: yields the entry whose id matches, otherwise the
            # last entry. An empty list raises IndexError.
            chosen = candidates[0]
            for chosen in candidates:
                if chosen._transcript_id == wanted_id:
                    break
            return chosen

        base_output_filename = "out/test_retrieving_full_indices_ensembl"
        # Drop any indices left behind by an earlier run.
        for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
            shutil.rmtree(base_output_filename + suffix, ignore_errors=True)
        GenomeBuildFactory().construct_ensembl_indices(
            ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"],
            ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"],
            base_output_filename)

        by_gene = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", optimize=False)

        seo1_tx = pick(by_gene['SEO1'], "YAL067C")
        self.assertTrue(seo1_tx.get_seq().startswith('ATGTATTCAATTGTTAAAGAGATTATTGTAGATCCTTACAAAAGACTAAAATGGGGTTTT'))

        pau8_tx = pick(by_gene['PAU8'], "YAL068C")
        self.assertTrue(pau8_tx.get_strand() == "-")

        by_bin = Shove("file://" + base_output_filename + ".transcript_by_gp_bin.idx", "memory://")
        bin_transcripts = by_bin["I_585"]
        self.assertTrue(len(bin_transcripts) == 5, "There should be 5 transcripts.")
        bin_tx = pick(bin_transcripts, "YAL069W")
        self.assertTrue(bin_tx.get_strand() == "+")
    def test_build_ensembl_transcripts_by_genomic_location_index(self):
        """Build the genomic-position index from the yeast test data and probe bins on chromosome I."""
        protocol = "file"
        transcript_index_filename = "out/test_ensemble_gtf_for_gp.db"
        output_filename = "out/test_ensemble_gtf_for_gp.db.idx"
        # Only the position index is cleared here; the transcript index is not removed first.
        shutil.rmtree(output_filename, ignore_errors=True)

        factory = GenomeBuildFactory()
        factory.build_ensembl_transcript_index(
            ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"],
            ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"],
            transcript_index_filename,
            protocol=protocol)
        factory.build_ensembl_transcripts_by_genomic_location_index(transcript_index_filename, output_filename, protocol=protocol)

        # Reopen the freshly written index and check every bin key that exists in it.
        gp_index = Shove(protocol + "://" + output_filename)
        expected_transcript_id = "YAL067C"
        candidate_keys = ['I_' + str(bin_id) for bin_id in region2bins(1496172, 1496400)]

        # NOTE(review): if none of the candidate keys is present in the index,
        # no assertion is executed and the test passes vacuously.
        for key in candidate_keys:
            if key not in gp_index.keys():
                continue
            self.assertTrue(gp_index[key] == expected_transcript_id)
Exemple #21
0
    def test_build_ensembl_transcripts_by_gene_index(self):
        """Build the gene -> transcripts index from the yeast test data and look up SEO1."""
        protocol = "file"
        transcript_index_filename = "out/test_ensembl_gtf_for_gene.db"
        output_filename = "out/test_ensembl_gtf_for_gene.db.gene.idx"
        # Only the gene index is cleared here; the transcript index is not removed first.
        shutil.rmtree(output_filename, ignore_errors=True)

        factory = GenomeBuildFactory()
        factory.build_ensembl_transcript_index(
            ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"],
            ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"],
            transcript_index_filename,
            protocol=protocol)
        factory.build_ensembl_transcripts_by_gene_index(transcript_index_filename, output_filename)

        # Reopen the freshly written index and check the single SEO1 transcript.
        gene_index = Shove(protocol + "://" + output_filename, optimize=False)
        seo1_transcripts = gene_index['SEO1']
        self.assertTrue(len(seo1_transcripts) == 1)

        first_tx = seo1_transcripts[0]
        self.assertTrue(first_tx.get_transcript_id() == "YAL067C")
Exemple #22
0
    def test_build_ensembl_transcripts_by_genomic_location_index(self):
        """Build the genomic-position index from the yeast test data and probe bins on chromosome I."""
        protocol = "file"
        transcript_index_filename = "out/test_ensemble_gtf_for_gp.db"
        output_filename = "out/test_ensemble_gtf_for_gp.db.idx"
        # Only the position index is cleared here; the transcript index is not removed first.
        shutil.rmtree(output_filename, ignore_errors=True)

        factory = GenomeBuildFactory()
        factory.build_ensembl_transcript_index(
            ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"],
            ["testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"],
            transcript_index_filename,
            protocol=protocol)
        factory.build_ensembl_transcripts_by_genomic_location_index(transcript_index_filename, output_filename, protocol=protocol)

        # Reopen the freshly written index and check every bin key that exists in it.
        gp_index = Shove(protocol + "://" + output_filename)
        expected_transcript_id = "YAL067C"
        candidate_keys = ['I_' + str(bin_id) for bin_id in region2bins(1496172, 1496400)]

        # NOTE(review): if none of the candidate keys is present in the index,
        # no assertion is executed and the test passes vacuously.
        for key in candidate_keys:
            if key not in gp_index.keys():
                continue
            self.assertTrue(gp_index[key] == expected_transcript_id)
def main():
    """Build an ensembl/GENCODE transcript datasource from the command-line options.

    The datasource is assembled in a fresh temporary directory (config file,
    the transcript indices and an md5 file) and then copied to the requested
    output directory. Any ``Exception`` raised while building is logged as
    fatal and swallowed, so the function returns normally in that case. The
    temporary directory is removed in every case.
    """
    setup_logging()
    args = parseOptions()
    logger = logging.getLogger(__name__)

    gtf_files = args.gtf_files.split(",")
    fasta_files = args.fasta_files.split(",")
    output_dir = args.output_dir
    genome_build = args.genome_build
    name = args.name
    ver = args.version
    tx_filter = args.filter
    protein_map_file = args.protein_map_file

    # create temp dir
    tmpDir = tempfile.mkdtemp(prefix="onco_ensembl_ds_")
    try:
        logger.info("Creating tmp dir (" + tmpDir + ") ....")
        ds_build_dir = tmpDir + "/" + genome_build + "/"
        os.mkdir(ds_build_dir)

        # The "basic" filter relies on GENCODE; warn when no GTF path mentions gencode.
        if "gencode" not in args.gtf_files.lower() and tx_filter == "basic":
            logger.warning("basic filter requested for (apparently) a non-gencode set of GTFs.  If this is an ENSEMBL run (not GENCODE), please specify dummy, using --filter.")

        logger.info("Creating config file...")
        config_filename = os.path.join(ds_build_dir, name + ".config")
        tmp_config_filename = config_filename + ".tmp"
        logger.info("config file being written to: " + os.path.abspath(config_filename))

        config_file_creator = GenericTsvDatasourceCreator()
        idx_cols = DatasourceInstallUtils.indexCols("dummy_option", "dummy_values")
        config_file_creator._createConfigFile(configFilename=tmp_config_filename, baseDSFile=os.path.basename(gtf_files[0]), ds_type="ensembl", ds_version=ver, ds_name=name, indexCols=idx_cols)

        # Re-read the generated config and add the transcript filter setting.
        # (Only the filter is written here; the protein map file is passed to
        # the index construction below, not stored in the config.)
        config_parser = SafeConfigParser()
        with open(tmp_config_filename, 'r') as fp:
            config_parser.readfp(fp)
        config_parser.set("general", "transcript_filter", tx_filter)

        # Write updated config file
        with open(config_filename, 'w') as fp:
            config_parser.write(fp)

        logger.info("Starting index construction (temp location: " + ds_build_dir + ") ...")
        factory = GenomeBuildFactory()
        factory.construct_ensembl_indices(gtf_files, fasta_files, os.path.join(ds_build_dir, os.path.basename(gtf_files[0])), protein_id_mapping_file=protein_map_file)

        logger.info("Creating datasource md5...")
        DatasourceInstallUtils.create_datasource_md5_file(ds_build_dir)

        logger.info("Copying created datasource from temp directory to final location (" + output_dir + ")...")
        shutil.copytree(symlinks=True, src=tmpDir, dst=output_dir)

    except Exception as e:
        import traceback
        logger.fatal(repr(e) + " " + traceback.format_exc())
        logger.info("If you are getting an error such as:  KeyError: 'ENST00000474204.1', then you may be out of disk space in /tmp/.")

    finally:
        # Remove the tempdir, also when the run is interrupted by something
        # other than an Exception (e.g. KeyboardInterrupt).
        logger.info("Done...")
        logger.info("Removing ..." + tmpDir + '/')
        shutil.rmtree(tmpDir)