def test_build_ensembl_transcript_index(self):
        """Build the gtf portion of the ensembl transcript db.

        Builds the index from the trimmed yeast GTF/FASTA test data, then checks
        that transcript YDR529C is present and carries a sequence, a CDS and exons.
        The generated index is removed at the end of the test.
        """
        # Shell commands that were used to derive the trimmed test data:
        # cat ~/oncotator_pycharm/oncotator/test/testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf | cut -f 9 | cut -f 5 --delimiter=" " | sort | uniq | sed -r "s/;//g" | sed -r "s/\"//g"
        #  snR84, tK(UUU)K, YAL067C, YAL067W-A, YAL068C, YAL068W-A, YAL069W, YBR278W, YBR279W, YBR280C, YBR281C, YDR528W, YDR529C, YKR074W,
        #
        # grep -Pzo  ">(snR84|tK\(UUU\)K|YAL067C|YAL067W-A|YAL068C|YAL068W-A|YAL069W|YBR278W|YBR279W|YBR280C|YBR281C|YDR528W|YDR529C|YKR074W)([A-Za-z_0-9 \:\-\n]+)" Saccharomyces_cerevisiae.EF4.71.cdna.all.fa >Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa
        #
        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

        output_filename = "out/test_ensembl_gtf.db"
        protocol = "file"
        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], output_filename, protocol=protocol)
        self.assertTrue(os.path.exists(output_filename))

        shove = Shove(protocol + "://" + output_filename, "memory://")
        self.assertTrue(len(shove.keys()) > 0)
        self.assertTrue("YDR529C" in shove.keys())
        t = shove["YDR529C"]
        self.assertTrue(t.get_seq() is not None)
        # Compare by value: an identity test against a string literal depends on
        # interpreter interning and does not reliably detect an empty sequence.
        self.assertTrue(t.get_seq() != "")
        self.assertTrue(len(t.get_cds()) > 0)
        self.assertTrue(len(t.get_exons()) > 0)
        MutUtils.removeDir(output_filename)
Exemple #2
0
    def test_gencode_small(self):
        """Check that Transcript instances can be built from a small gencode gtf and fasta.

        Builds all indices for the MAPK1 test data, then inspects the gene index
        (4 transcripts expected) and genomic-position bucket "22_753" (a single
        minus-strand transcript expected).
        """
        gtf_path = "testdata/gencode/MAPK1.gencode.v19.annotation.gtf"
        fasta_path = "testdata/gencode/MAPK1.gencode.v19.pc_transcripts.fa"
        out_base = "out/test_small_gencode"

        # Clear out indices left behind by a previous run.
        for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
            shutil.rmtree(out_base + suffix, ignore_errors=True)

        factory = GenomeBuildFactory()
        factory.construct_ensembl_indices([gtf_path], [fasta_path], out_base)

        by_gene = Shove("file://" + out_base + ".transcript_by_gene.idx", "memory://", optimize=False)
        mapk1_transcripts = by_gene["MAPK1"]
        self.assertTrue(len(mapk1_transcripts) == 4)

        by_bin = Shove("file://" + out_base + ".transcript_by_gp_bin.idx", "memory://", optimize=False)
        bucket = by_bin["22_753"]
        self.assertTrue(bucket[0].get_strand() == "-")
        self.assertTrue(len(bucket) == 1)

        # Only the named transcript has its sequence prefix checked.
        for candidate in bucket:
            if candidate.get_transcript_id() == "ENST00000215832.6":
                self.assertTrue(candidate.get_seq().startswith("AGGCAATCGGTCCGAG"))
Exemple #3
0
    def test_build_ensembl_transcript_index(self):
        """Build the gtf portion of the ensembl transcript db.

        Builds the index from the trimmed yeast GTF/FASTA test data, then checks
        that transcript YDR529C is present and carries a sequence, a CDS and exons.
        The generated index is removed at the end of the test.
        """
        # Shell commands that were used to derive the trimmed test data:
        # cat ~/oncotator_pycharm/oncotator/test/testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf | cut -f 9 | cut -f 5 --delimiter=" " | sort | uniq | sed -r "s/;//g" | sed -r "s/\"//g"
        #  snR84, tK(UUU)K, YAL067C, YAL067W-A, YAL068C, YAL068W-A, YAL069W, YBR278W, YBR279W, YBR280C, YBR281C, YDR528W, YDR529C, YKR074W,
        #
        # grep -Pzo  ">(snR84|tK\(UUU\)K|YAL067C|YAL067W-A|YAL068C|YAL068W-A|YAL069W|YBR278W|YBR279W|YBR280C|YBR281C|YDR528W|YDR529C|YKR074W)([A-Za-z_0-9 \:\-\n]+)" Saccharomyces_cerevisiae.EF4.71.cdna.all.fa >Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa
        #
        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

        output_filename = "out/test_ensembl_gtf.db"
        protocol = "file"
        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], output_filename, protocol=protocol)
        self.assertTrue(os.path.exists(output_filename))

        shove = Shove(protocol + "://" + output_filename, "memory://")
        self.assertTrue(len(shove.keys()) > 0)
        self.assertTrue("YDR529C" in shove.keys())
        t = shove["YDR529C"]
        self.assertTrue(t.get_seq() is not None)
        # Compare by value: an identity test against a string literal depends on
        # interpreter interning and does not reliably detect an empty sequence.
        self.assertTrue(t.get_seq() != "")
        self.assertTrue(len(t.get_cds()) > 0)
        self.assertTrue(len(t.get_exons()) > 0)
        MutUtils.removeDir(output_filename)
    def build_ensembl_transcript_index(self, ensembl_input_gtfs, ensembl_input_fastas, output_filename, protocol="file", protein_id_mapping_file=None):
        """Create the transcript index (using shove) for ensembl.  Key is transcript ID.

        Note:  This method will hold the entire transcript index in RAM.

        :param ensembl_input_gtfs: (list) paths of the gtf files to index
        :param ensembl_input_fastas: (list) sequence data for transcripts corresponding to what is in the gtfs
        :param output_filename: location of the shove store that receives the index
        :param protocol: shove protocol.  Usually "file" or "sqlite"
        :param protein_id_mapping_file: handed to _create_tx_id_to_protein_id_mapping (default None)
        """
        logger = logging.getLogger(__name__)

        # Example code taken from http://biopython.org/wiki/GFF_Parsing
        shove = Shove(protocol + "://" + output_filename, "memory://")
        try:
            logger.info("Transcript index being created: " + protocol + "://" + output_filename)

            # Get the transcript ID to protein ID mapping
            tx_to_protein_mapping = self._create_tx_id_to_protein_id_mapping(protein_id_mapping_file)

            seq_dict = {}
            for in_seq_file in ensembl_input_fastas:
                # The context manager closes the fasta even if parsing raises.
                with open(in_seq_file) as in_seq_handle:
                    seq_dict.update(self._create_seq_dict(in_seq_handle))
                logger.info("Parsed fasta file: " + in_seq_file)

            num_gtfs = len(ensembl_input_gtfs)
            for file_ctr, in_file in enumerate(ensembl_input_gtfs):
                seq_dict_keys = seq_dict.keys()
                ctr = 0
                # The parser is given the path itself, so no separate handle is opened here.
                for rec in GFF.parse_simple(in_file):

                    # transcript id seems to always be a list of length 1
                    if len(rec['quals']['transcript_id']) > 1:
                        logger.warning("ensembl records had more than one transcript id: " + str(rec['quals']['transcript_id']))

                    self._convertGFFRecordToTranscript(rec, seq_dict, seq_dict_keys, tx_to_protein_mapping)
                    ctr += 1
                    if (ctr % 10000) == 0:
                        logger.info("Added " + str(ctr) + " lines of gtf " + str(file_ctr+1) + " of " + str(num_gtfs) + " (" + in_file + ") into internal transcript index.")
                logger.info("Finished " + str(ctr) + " lines of gtf (" + in_file + ")")

            logger.info("Populating final db with internal transcript index.")
            transcript_index_keys = self._transcript_index.keys()
            num_transcripts = len(transcript_index_keys)
            for i, k in enumerate(transcript_index_keys):

                # Populate the protein sequence
                protein_sequence = self._determine_protein_seq(self._transcript_index[k])
                self._transcript_index[k].set_protein_seq(protein_sequence)

                shove[k] = self._transcript_index[k]
                if i % 10000 == 0:
                    logger.info("Saved %0.1f%% of transcript index to disk with protein sequence." % (float(i*100)/float(num_transcripts)))

            logger.info("Transcript index created " + str(len(shove.keys())) + " transcripts: " + protocol + "://" + output_filename)
        finally:
            # Always release the store, including when parsing or saving fails.
            shove.close()
    def build_ensembl_transcripts_by_genomic_location_index(
            self,
            ensembl_transcript_index_fname,
            output_filename,
            protocol="file"):
        """Create an index for genomic position to transcripts index, using a transcript index created in
            build_ensembl_transcript_index

        Each transcript is appended to the list stored under the key
        "<contig>_<bin>", where the bin is region2bin(start, end) of the transcript.

        :param ensembl_transcript_index_fname: file/dir location of the transcript index to read
        :param output_filename: file/dir location of the index to write
        :param protocol: shove protocol used for both stores
        """
        logger = logging.getLogger(__name__)
        transcript_db = Shove(protocol + "://" +
                              ensembl_transcript_index_fname)
        try:
            output_db = Shove(protocol + "://" + output_filename,
                              optimize=False)
            try:
                for i, tx_id in enumerate(transcript_db.keys()):
                    tx = transcript_db[tx_id]
                    genomic_location_bin = region2bin(tx.get_start(),
                                                      tx.get_end())
                    key = tx.get_contig() + "_" + str(genomic_location_bin)

                    # Start a fresh bucket the first time a key is seen.
                    try:
                        bucket = output_db[key]
                    except KeyError:
                        bucket = []
                    bucket.append(tx)
                    # Assign back so the store persists the grown list.
                    output_db[key] = bucket

                    if (i + 1) % 10000 == 0:
                        # i is zero-based, so i + 1 transcripts have been handled.
                        logger.info("Genomic position index added " +
                                    str(i + 1) + " transcripts so far.")
            finally:
                output_db.close()
        finally:
            transcript_db.close()
    def build_ensembl_transcripts_by_gene_index(self, ensembl_transcript_index_fname, output_filename, protocol="file"):
        """ Create an index for gene --> transcripts using a transcript index created in build_ensembl_transcript_index

        Each transcript is appended to the list stored under its get_gene() value.

        :param ensembl_transcript_index_fname: file/dir location for ensembl transcript db
        :param output_filename: file/dir location of the gene index to write
        :param protocol: shove protocol used for both stores
        :return:
        """

        #TODO: This may need to be moved to the init of the transcript datasource as that may be faster.

        logger = logging.getLogger(__name__)
        transcript_db = Shove(protocol + "://" + ensembl_transcript_index_fname, "memory://")
        try:
            output_db = Shove(protocol + "://" + output_filename, "memory://", optimize=False)
            try:
                for i, tx_id in enumerate(transcript_db.keys()):
                    tx = transcript_db[tx_id]
                    gene = tx.get_gene()

                    # Start a fresh bucket the first time a gene is seen.
                    try:
                        bucket = output_db[gene]
                    except KeyError:
                        bucket = []
                    bucket.append(tx)
                    # Assign back so the store persists the grown list.
                    output_db[gene] = bucket

                    if (i+1) % 10000 == 0:
                        # i is zero-based, so i + 1 transcripts have been handled.
                        logger.info("Gene index added " + str(i+1) + " transcripts so far.")
                logger.info("Finished gene index with " + str(len(output_db.keys())) + " genes.")
            finally:
                output_db.close()
        finally:
            transcript_db.close()
    def build_ensembl_transcripts_by_genomic_location_index(self, ensembl_transcript_index_fname, output_filename, protocol="file"):
        """Create an index for genomic position to transcripts index, using a transcript index created in
            build_ensembl_transcript_index

        Each transcript is appended to the list stored under the key
        "<contig>_<bin>", where the bin is region2bin(start, end) of the transcript.

        :param ensembl_transcript_index_fname: file/dir location of the transcript index to read
        :param output_filename: file/dir location of the index to write
        :param protocol: shove protocol used for both stores
        """
        logger = logging.getLogger(__name__)
        transcript_db = Shove(protocol + "://" + ensembl_transcript_index_fname)
        try:
            output_db = Shove(protocol + "://" + output_filename, optimize=False)
            try:
                for i, tx_id in enumerate(transcript_db.keys()):
                    tx = transcript_db[tx_id]
                    genomic_location_bin = region2bin(tx.get_start(), tx.get_end())
                    key = tx.get_contig() + "_" + str(genomic_location_bin)

                    # Start a fresh bucket the first time a key is seen.
                    try:
                        bucket = output_db[key]
                    except KeyError:
                        bucket = []
                    bucket.append(tx)
                    # Assign back so the store persists the grown list.
                    output_db[key] = bucket

                    if (i+1) % 10000 == 0:
                        # i is zero-based, so i + 1 transcripts have been handled.
                        logger.info("Genomic position index added " + str(i+1) + " transcripts so far.")
            finally:
                output_db.close()
        finally:
            transcript_db.close()
def parse_with_shove(fname, callableParsingFunction, pickleDir=""):
    """Return a shove mapping for fname, parsing the file only when no cached store exists.

    The store lives at pickleDir + basename(fname) + ".shv".  If that path
    already exists it is opened and returned; otherwise fname is parsed with
    callableParsingFunction and every resulting entry is written to a new store.

    Pickle dir MUST include appended "/"

    :param fname: path of the file to parse
    :param callableParsingFunction: called with the open file handle; must return
        a mapping whose values have a ``references`` attribute
    :param pickleDir: directory prefix for the shove store
    :return: the opened Shove instance
    """
    logger = logging.getLogger(__name__)
    shoveFilename = pickleDir + os.path.basename(fname) + ".shv"
    if os.path.exists(shoveFilename):
        logger.info("Loading shove structure: " + str(shoveFilename))
        return Shove("file://" + shoveFilename, "simple://")

    logger.info("Parsing...")
    # open() replaces the Python-2-only file() constructor; the handle stays
    # open until every entry is stored, in case the parser reads lazily.
    with open(fname, 'r') as in_handle:
        tmpStruct = callableParsingFunction(in_handle)
        logger.info("Writing shove db: " + str(shoveFilename))
        ks = tmpStruct.keys()
        g = Shove("file://" + shoveFilename)
        for k in ks:
            del tmpStruct[k].references  # May be causing an error later down the road.
            g[k] = tmpStruct[k]
    return g
Exemple #9
0
    def test_retrieving_sequence(self):
        """Ensure we can retrieve a sequence from an ensembl transcript given a gene.

        Builds all indices for the trimmed yeast test data, then looks up
        transcripts through the gene index (SEO1, PAU8) and through the
        genomic-position index (bucket "I_585").
        """

        def pick_transcript(candidates, wanted_id):
            # Return the candidate whose id matches wanted_id; when none matches
            # the last candidate is returned.  An empty list raises IndexError.
            chosen = candidates[0]
            for chosen in candidates:
                if chosen._transcript_id == wanted_id:
                    break
            return chosen

        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"
        base_output_filename = "out/test_retrieving_full_indices_ensembl"
        shutil.rmtree(base_output_filename + ".transcript.idx", ignore_errors=True)
        shutil.rmtree(base_output_filename + ".transcript_by_gene.idx", ignore_errors=True)
        shutil.rmtree(base_output_filename + ".transcript_by_gp_bin.idx", ignore_errors=True)
        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.construct_ensembl_indices([ensembl_input_gtf], [ensembl_input_fasta], base_output_filename)

        seq_index = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", optimize=False)
        transcript = pick_transcript(seq_index['SEO1'], "YAL067C")
        self.assertTrue(transcript.get_seq().startswith('ATGTATTCAATTGTTAAAGAGATTATTGTAGATCCTTACAAAAGACTAAAATGGGGTTTT'))

        transcript = pick_transcript(seq_index['PAU8'], "YAL068C")
        self.assertTrue(transcript.get_strand() == "-")

        seq_index_gp = Shove("file://" + base_output_filename + ".transcript_by_gp_bin.idx", "memory://")
        transcripts = seq_index_gp["I_585"]
        self.assertTrue(len(transcripts) == 5, "There should be 5 transcripts.")
        transcript = pick_transcript(transcripts, "YAL069W")
        self.assertTrue(transcript.get_strand() == "+")
    def test_build_ensembl_transcripts_by_genomic_location_index(self):
        """Test that we can get an ensembl transcript from a genomic position

        Builds the transcript index and the genomic-position index for the
        trimmed yeast test data, then checks every bucket of contig I that
        overlaps the query region and is present in the index.
        """
        protocol = "file"
        transcript_index_filename = "out/test_ensemble_gtf_for_gp.db"
        output_filename = "out/test_ensemble_gtf_for_gp.db.idx"
        shutil.rmtree(output_filename, ignore_errors=True)

        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], transcript_index_filename, protocol=protocol)
        genome_build_factory.build_ensembl_transcripts_by_genomic_location_index(transcript_index_filename, output_filename, protocol=protocol)

        # Now load the index and look something up.
        gp_index = Shove(protocol + "://" + output_filename)
        gt_transcript_id = "YAL067C"
        bins = region2bins(1496172, 1496400)

        # Read the key listing once instead of once per bin.
        indexed_keys = gp_index.keys()
        for genomic_bin in bins:
            key = 'I_' + str(genomic_bin)
            if key in indexed_keys:
                # A bucket is a list of transcripts, so match on transcript ids;
                # comparing the bucket itself to a string could never succeed.
                bucket_ids = [tx.get_transcript_id() for tx in gp_index[key]]
                self.assertTrue(gt_transcript_id in bucket_ids)
Exemple #11
0
    def test_build_ensembl_transcripts_by_genomic_location_index(self):
        """Test that we can get an ensembl transcript from a genomic position

        Builds the transcript index and the genomic-position index for the
        trimmed yeast test data, then checks every bucket of contig I that
        overlaps the query region and is present in the index.
        """
        protocol = "file"
        transcript_index_filename = "out/test_ensemble_gtf_for_gp.db"
        output_filename = "out/test_ensemble_gtf_for_gp.db.idx"
        shutil.rmtree(output_filename, ignore_errors=True)

        ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

        genome_build_factory = GenomeBuildFactory()
        genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], transcript_index_filename, protocol=protocol)
        genome_build_factory.build_ensembl_transcripts_by_genomic_location_index(transcript_index_filename, output_filename, protocol=protocol)

        # Now load the index and look something up.
        gp_index = Shove(protocol + "://" + output_filename)
        gt_transcript_id = "YAL067C"
        bins = region2bins(1496172, 1496400)

        # Read the key listing once instead of once per bin.
        indexed_keys = gp_index.keys()
        for genomic_bin in bins:
            key = 'I_' + str(genomic_bin)
            if key in indexed_keys:
                # A bucket is a list of transcripts, so match on transcript ids;
                # comparing the bucket itself to a string could never succeed.
                bucket_ids = [tx.get_transcript_id() for tx in gp_index[key]]
                self.assertTrue(gt_transcript_id in bucket_ids)
    def build_ensembl_transcripts_by_gene_index(self,
                                                ensembl_transcript_index_fname,
                                                output_filename,
                                                protocol="file"):
        """ Create an index for gene --> transcripts using a transcript index created in build_ensembl_transcript_index

        Each transcript is appended to the list stored under its get_gene() value.

        :param ensembl_transcript_index_fname: file/dir location for ensembl transcript db
        :param output_filename: file/dir location of the gene index to write
        :param protocol: shove protocol used for both stores
        :return:
        """

        #TODO: This may need to be moved to the init of the transcript datasource as that may be faster.

        logger = logging.getLogger(__name__)
        transcript_db = Shove(
            protocol + "://" + ensembl_transcript_index_fname, "memory://")
        try:
            output_db = Shove(protocol + "://" + output_filename,
                              "memory://",
                              optimize=False)
            try:
                for i, tx_id in enumerate(transcript_db.keys()):
                    tx = transcript_db[tx_id]
                    gene = tx.get_gene()

                    # Start a fresh bucket the first time a gene is seen.
                    try:
                        bucket = output_db[gene]
                    except KeyError:
                        bucket = []
                    bucket.append(tx)
                    # Assign back so the store persists the grown list.
                    output_db[gene] = bucket

                    if (i + 1) % 10000 == 0:
                        # i is zero-based, so i + 1 transcripts have been handled.
                        logger.info("Gene index added " + str(i + 1) +
                                    " transcripts so far.")
                logger.info("Finished gene index with " +
                            str(len(output_db.keys())) + " genes.")
            finally:
                output_db.close()
        finally:
            transcript_db.close()
Exemple #13
0
    def test_multiple_gtf_initialization(self):
        """Check that indices can be built from several gtf and fasta files at once.

        Builds the indices from the CP and MAPK1 gencode test data and verifies
        the transcript counts per gene in the gene index.
        """
        gtf_paths = ["testdata/gencode/CP.gencode.v19.annotation.gtf", "testdata/gencode/MAPK1.gencode.v19.annotation.gtf"]
        fasta_paths = ["testdata/gencode/CP.gencode.v19.pc_transcripts.fa", "testdata/gencode/MAPK1.gencode.v19.pc_transcripts.fa"]
        out_base = "out/test_multi_gencode"

        # Clear out indices left behind by a previous run.
        for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
            shutil.rmtree(out_base + suffix, ignore_errors=True)

        factory = GenomeBuildFactory()
        factory.construct_ensembl_indices(gtf_paths, fasta_paths, out_base)
        by_gene = Shove("file://" + out_base + ".transcript_by_gene.idx", "memory://", optimize=False)

        cp_transcripts = by_gene["CP"]
        self.assertTrue(len(cp_transcripts) == 15)

        mapk1_transcripts = by_gene["MAPK1"]
        self.assertTrue(len(mapk1_transcripts) == 4)

        # ENST00000491588.1 is exempt from the sequence-length requirement.
        for candidate in mapk1_transcripts:
            is_acceptable = candidate.get_transcript_id() == "ENST00000491588.1" or len(candidate.get_seq()) > 100
            self.assertTrue(is_acceptable, "No seq data for " + candidate.get_transcript_id() )
Exemple #14
0
    def test_build_ensembl_transcripts_by_gene_index(self):
        """Check that the gene index maps a gene to its transcript.

        Builds the transcript index and the gene index for the trimmed yeast
        test data and expects gene SEO1 to hold exactly one transcript, YAL067C.
        """
        protocol = "file"
        tx_index_path = "out/test_ensembl_gtf_for_gene.db"
        gene_index_path = "out/test_ensembl_gtf_for_gene.db.gene.idx"
        # Drop any gene index left behind by a previous run.
        shutil.rmtree(gene_index_path, ignore_errors=True)

        gtf_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
        fasta_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

        factory = GenomeBuildFactory()
        factory.build_ensembl_transcript_index([gtf_path], [fasta_path], tx_index_path, protocol=protocol)
        factory.build_ensembl_transcripts_by_gene_index(tx_index_path, gene_index_path)

        # Reopen the freshly written index for the lookups.
        by_gene = Shove(protocol + "://" + gene_index_path, optimize=False)
        self.assertTrue(len(by_gene['SEO1']) == 1)
        first_tx = by_gene['SEO1'][0]

        self.assertTrue(first_tx.get_transcript_id()=="YAL067C")
Exemple #15
0
    def test_gencode_cp(self):
        """Check that a gene which used to cause indexing problems can be indexed.

        Builds the indices for the CP gencode test data, expects 15 transcripts
        for gene CP and expects ENST00000474204.1 to be among them.
        """
        gtf_path = "testdata/gencode/CP.gencode.v19.annotation.gtf"
        fasta_path = "testdata/gencode/CP.gencode.v19.pc_transcripts.fa"
        out_base = "out/test_cp_gencode"

        # Clear out indices left behind by a previous run.
        for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
            shutil.rmtree(out_base + suffix, ignore_errors=True)

        factory = GenomeBuildFactory()
        factory.construct_ensembl_indices([gtf_path], [fasta_path], out_base)
        by_gene = Shove("file://" + out_base + ".transcript_by_gene.idx", "memory://", optimize=False)
        cp_transcripts = by_gene["CP"]

        self.assertTrue(len(cp_transcripts) == 15)
        troubled_transcript = "ENST00000474204.1"
        # any() stops at the first match, just like an explicit loop with break.
        was_seen = any(candidate.get_transcript_id() == troubled_transcript for candidate in cp_transcripts)
        self.assertTrue(was_seen)
    def build_ensembl_transcript_index(self,
                                       ensembl_input_gtfs,
                                       ensembl_input_fastas,
                                       output_filename,
                                       protocol="file",
                                       protein_id_mapping_file=None):
        """Create the transcript index (using shove) for ensembl.  Key is transcript ID.

        Note:  This method will hold the entire transcript index in RAM.

        :param ensembl_input_gtfs: (list) paths of the gtf files to index
        :param ensembl_input_fastas: (list) sequence data for transcripts corresponding to what is in the gtfs
        :param output_filename: location of the shove store that receives the index
        :param protocol: shove protocol.  Usually "file" or "sqlite"
        :param protein_id_mapping_file: handed to _create_tx_id_to_protein_id_mapping (default None)
        """
        logger = logging.getLogger(__name__)

        # Example code taken from http://biopython.org/wiki/GFF_Parsing
        shove = Shove(protocol + "://" + output_filename, "memory://")
        try:
            logger.info("Transcript index being created: " + protocol +
                        "://" + output_filename)

            # Get the transcript ID to protein ID mapping
            tx_to_protein_mapping = self._create_tx_id_to_protein_id_mapping(
                protein_id_mapping_file)

            seq_dict = {}
            for in_seq_file in ensembl_input_fastas:
                # The context manager closes the fasta even if parsing raises.
                with open(in_seq_file) as in_seq_handle:
                    seq_dict.update(self._create_seq_dict(in_seq_handle))
                logger.info("Parsed fasta file: " + in_seq_file)

            num_gtfs = len(ensembl_input_gtfs)
            for file_ctr, in_file in enumerate(ensembl_input_gtfs):
                seq_dict_keys = seq_dict.keys()
                ctr = 0
                # The parser is given the path itself, so no separate handle is opened here.
                for rec in GFF.parse_simple(in_file):

                    # transcript id seems to always be a list of length 1
                    if len(rec['quals']['transcript_id']) > 1:
                        logger.warning(
                            "ensembl records had more than one transcript id: " +
                            str(rec['quals']['transcript_id']))

                    self._convertGFFRecordToTranscript(rec, seq_dict,
                                                       seq_dict_keys,
                                                       tx_to_protein_mapping)
                    ctr += 1
                    if (ctr % 10000) == 0:
                        logger.info(
                            "Added " + str(ctr) + " lines of gtf " +
                            str(file_ctr + 1) + " of " + str(num_gtfs) +
                            " (" + in_file +
                            ") into internal transcript index.")
                logger.info("Finished " + str(ctr) + " lines of gtf (" +
                            in_file + ")")

            logger.info("Populating final db with internal transcript index.")
            transcript_index_keys = self._transcript_index.keys()
            num_transcripts = len(transcript_index_keys)
            for i, k in enumerate(transcript_index_keys):

                # Populate the protein sequence
                protein_sequence = self._determine_protein_seq(
                    self._transcript_index[k])
                self._transcript_index[k].set_protein_seq(protein_sequence)

                shove[k] = self._transcript_index[k]
                if i % 10000 == 0:
                    logger.info(
                        "Saved %0.1f%% of transcript index to disk with protein sequence."
                        % (float(i * 100) / float(num_transcripts)))

            logger.info("Transcript index created " +
                        str(len(shove.keys())) + " transcripts: " +
                        protocol + "://" + output_filename)
        finally:
            # Always release the store, including when parsing or saving fails.
            shove.close()