Esempio n. 1
0
    def build_ensembl_transcripts_by_genomic_location_index(self, ensembl_transcript_index_fname, output_filename, protocol="file"):
        """Create an index for genomic position to transcripts index, using a transcript index created in
            build_ensembl_transcript_index

        Every transcript in the input index is appended to a list stored under the key
        "<contig>_<bin>", where <bin> is the value returned by region2bin(start, end) for
        that transcript.

        :param ensembl_transcript_index_fname: location of the transcript index to read (appended to protocol + "://")
        :param output_filename: location of the index to write (appended to protocol + "://")
        :param protocol: shove protocol prefix used for both the input and the output store
        """
        logger = logging.getLogger(__name__)

        transcript_db = Shove(protocol + "://" + ensembl_transcript_index_fname)
        try:
            output_db = Shove(protocol + "://" + output_filename, optimize=False)
            try:
                for i, tx_id in enumerate(transcript_db.keys()):
                    tx = transcript_db[tx_id]
                    start = tx.get_start()
                    end = tx.get_end()
                    genomic_location_bin = region2bin(start, end)
                    key = tx.get_contig() + "_" + str(genomic_location_bin)

                    # Read-modify-write: fetch the transcripts already stored for this bin
                    # (or start a fresh list) and store the extended list back.
                    try:
                        transcripts_in_bin = output_db[key]
                    except KeyError:
                        transcripts_in_bin = []
                    transcripts_in_bin.append(tx)
                    output_db[key] = transcripts_in_bin

                    # i is zero-based, so i + 1 transcripts have been processed at this point.
                    if (i + 1) % 10000 == 0:
                        logger.info("Genomic position index added %d transcripts so far.", i + 1)
            finally:
                # Close the output store even when indexing fails part-way.
                output_db.close()
        finally:
            transcript_db.close()
Esempio n. 2
0
    def build_ensembl_transcripts_by_gene_index(self, ensembl_transcript_index_fname, output_filename, protocol="file"):
        """ Create an index for gene --> transcripts using a transcript index created in build_ensembl_transcript_index

        Every transcript in the input index is appended to a list stored under the value
        returned by its get_gene().

        :param ensembl_transcript_index_fname: file/dir location for ensembl transcript db
        :param output_filename: location of the gene index to write (appended to protocol + "://")
        :param protocol: shove protocol prefix used for both the input and the output store
        :return: None.  The result is written to the output store.
        """

        #TODO: This may need to be moved to the init of the transcript datasource as that may be faster.

        logger = logging.getLogger(__name__)

        transcript_db = Shove(protocol + "://" + ensembl_transcript_index_fname, "memory://")
        try:
            output_db = Shove(protocol + "://" + output_filename, "memory://", optimize=False)
            try:
                for i, tx_id in enumerate(transcript_db.keys()):
                    tx = transcript_db[tx_id]
                    gene = tx.get_gene()

                    # Read-modify-write: fetch the transcripts already stored for this gene
                    # (or start a fresh list) and store the extended list back.
                    try:
                        gene_transcripts = output_db[gene]
                    except KeyError:
                        gene_transcripts = []
                    gene_transcripts.append(tx)
                    output_db[gene] = gene_transcripts

                    # i is zero-based, so i + 1 transcripts have been processed at this point.
                    if (i + 1) % 10000 == 0:
                        logger.info("Gene index added %d transcripts so far.", i + 1)
                logger.info("Finished gene index with %d genes.", len(output_db.keys()))
            finally:
                # Close the output store even when indexing fails part-way.
                output_db.close()
        finally:
            transcript_db.close()
    def build_ensembl_transcripts_by_genomic_location_index(
            self,
            ensembl_transcript_index_fname,
            output_filename,
            protocol="file"):
        """Create an index for genomic position to transcripts index, using a transcript index created in
            build_ensembl_transcript_index

        Every transcript in the input index is appended to a list stored
        under the key "<contig>_<bin>", where <bin> is the value returned by
        region2bin(start, end) for that transcript.

        :param ensembl_transcript_index_fname: location of the transcript
            index to read (appended to protocol + "://")
        :param output_filename: location of the index to write (appended to
            protocol + "://")
        :param protocol: shove protocol prefix used for both the input and
            the output store
        """
        logger = logging.getLogger(__name__)

        transcript_db = Shove(protocol + "://" +
                              ensembl_transcript_index_fname)
        try:
            output_db = Shove(protocol + "://" + output_filename,
                              optimize=False)
            try:
                for i, tx_id in enumerate(transcript_db.keys()):
                    tx = transcript_db[tx_id]
                    start = tx.get_start()
                    end = tx.get_end()
                    genomic_location_bin = region2bin(start, end)
                    key = tx.get_contig() + "_" + str(genomic_location_bin)

                    # Read-modify-write: fetch the transcripts already
                    # stored for this bin (or start a fresh list) and store
                    # the extended list back.
                    try:
                        transcripts_in_bin = output_db[key]
                    except KeyError:
                        transcripts_in_bin = []
                    transcripts_in_bin.append(tx)
                    output_db[key] = transcripts_in_bin

                    # i is zero-based, so i + 1 transcripts have been
                    # processed at this point.
                    if (i + 1) % 10000 == 0:
                        logger.info(
                            "Genomic position index added %d transcripts so far.",
                            i + 1)
            finally:
                # Close the output store even when indexing fails part-way.
                output_db.close()
        finally:
            transcript_db.close()
Esempio n. 4
0
    def build_ensembl_transcript_index(self, ensembl_input_gtfs, ensembl_input_fastas, output_filename, protocol="file", protein_id_mapping_file=None):
        """Create the transcript index (using shove) for ensembl.  Key is transcript ID.

        Note:  This method will hold the entire transcript index in RAM.

        The transcripts are read back from self._transcript_index after all GTF records have
        been processed (presumably populated by _convertGFFRecordToTranscript -- confirm),
        given a protein sequence, and then written to the shove store.

        :param ensembl_input_gtfs: (list) paths of the GTF files to parse
        :param ensembl_input_fastas: (list) sequence data for transcripts corresponding to what is in the gtfs
        :param output_filename: location of the shove store to write (appended to protocol + "://")
        :param protocol: shove protocol.  Usually "file" or "sqlite"
        :param protein_id_mapping_file: passed to _create_tx_id_to_protein_id_mapping to build the
            transcript ID to protein ID mapping
        """
        logger = logging.getLogger(__name__)

        # Example code taken from http://biopython.org/wiki/GFF_Parsing
        shove = Shove(protocol + "://" + output_filename, "memory://")
        try:
            logger.info("Transcript index being created: %s://%s", protocol, output_filename)

            # Get the transcript ID to protein ID mapping
            tx_to_protein_mapping = self._create_tx_id_to_protein_id_mapping(protein_id_mapping_file)

            seq_dict = {}
            for in_seq_file in ensembl_input_fastas:
                # The context manager closes the fasta even if parsing raises.
                with open(in_seq_file) as in_seq_handle:
                    seq_dict.update(self._create_seq_dict(in_seq_handle))
                logger.info("Parsed fasta file: %s", in_seq_file)

            for file_ctr, in_file in enumerate(ensembl_input_gtfs):
                seq_dict_keys = seq_dict.keys()
                ctr = 0
                # parse_simple is handed the path, so no file handle is opened here.
                for rec in GFF.parse_simple(in_file):

                    # transcript id seems to always be a list of length 1
                    if len(rec['quals']['transcript_id']) > 1:
                        logger.warning("ensembl records had more than one transcript id: %s", rec['quals']['transcript_id'])

                    self._convertGFFRecordToTranscript(rec, seq_dict, seq_dict_keys, tx_to_protein_mapping)
                    ctr += 1
                    if (ctr % 10000) == 0:
                        logger.info("Added %d lines of gtf %d of %d (%s) into internal transcript index.", ctr, file_ctr + 1, len(ensembl_input_gtfs), in_file)
                logger.info("Finished %d lines of gtf (%s)", ctr, in_file)

            logger.info("Populating final db with internal transcript index.")
            transcript_index_keys = self._transcript_index.keys()
            for i, k in enumerate(transcript_index_keys):
                tx = self._transcript_index[k]

                # Populate the protein sequence
                tx.set_protein_seq(self._determine_protein_seq(tx))

                shove[k] = tx
                if i % 10000 == 0:
                    logger.info("Saved %0.1f%% of transcript index to disk with protein sequence.", float(i * 100) / float(len(transcript_index_keys)))

            logger.info("Transcript index created %d transcripts: %s://%s", len(shove.keys()), protocol, output_filename)
        finally:
            # Close the store even when building the index fails part-way.
            shove.close()
    def build_ensembl_transcripts_by_gene_index(self,
                                                ensembl_transcript_index_fname,
                                                output_filename,
                                                protocol="file"):
        """ Create an index for gene --> transcripts using a transcript index created in build_ensembl_transcript_index

        Every transcript in the input index is appended to a list stored
        under the value returned by its get_gene().

        :param ensembl_transcript_index_fname: file/dir location for ensembl transcript db
        :param output_filename: location of the gene index to write
            (appended to protocol + "://")
        :param protocol: shove protocol prefix used for both the input and
            the output store
        :return: None.  The result is written to the output store.
        """

        #TODO: This may need to be moved to the init of the transcript datasource as that may be faster.

        logger = logging.getLogger(__name__)

        transcript_db = Shove(
            protocol + "://" + ensembl_transcript_index_fname, "memory://")
        try:
            output_db = Shove(protocol + "://" + output_filename,
                              "memory://",
                              optimize=False)
            try:
                for i, tx_id in enumerate(transcript_db.keys()):
                    tx = transcript_db[tx_id]
                    gene = tx.get_gene()

                    # Read-modify-write: fetch the transcripts already
                    # stored for this gene (or start a fresh list) and store
                    # the extended list back.
                    try:
                        gene_transcripts = output_db[gene]
                    except KeyError:
                        gene_transcripts = []
                    gene_transcripts.append(tx)
                    output_db[gene] = gene_transcripts

                    # i is zero-based, so i + 1 transcripts have been
                    # processed at this point.
                    if (i + 1) % 10000 == 0:
                        logger.info("Gene index added %d transcripts so far.",
                                    i + 1)
                logger.info("Finished gene index with %d genes.",
                            len(output_db.keys()))
            finally:
                # Close the output store even when indexing fails part-way.
                output_db.close()
        finally:
            transcript_db.close()
    def build_ensembl_transcript_index(self,
                                       ensembl_input_gtfs,
                                       ensembl_input_fastas,
                                       output_filename,
                                       protocol="file",
                                       protein_id_mapping_file=None):
        """Create the transcript index (using shove) for ensembl.  Key is transcript ID.

        Note:  This method will hold the entire transcript index in RAM.

        The transcripts are read back from self._transcript_index after all
        GTF records have been processed (presumably populated by
        _convertGFFRecordToTranscript -- confirm), given a protein sequence,
        and then written to the shove store.

        :param ensembl_input_gtfs: (list) paths of the GTF files to parse
        :param ensembl_input_fastas: (list) sequence data for transcripts corresponding to what is in the gtfs
        :param output_filename: location of the shove store to write
            (appended to protocol + "://")
        :param protocol: shove protocol.  Usually "file" or "sqlite"
        :param protein_id_mapping_file: passed to
            _create_tx_id_to_protein_id_mapping to build the transcript ID to
            protein ID mapping
        """
        logger = logging.getLogger(__name__)

        # Example code taken from http://biopython.org/wiki/GFF_Parsing
        shove = Shove(protocol + "://" + output_filename, "memory://")
        try:
            logger.info("Transcript index being created: %s://%s", protocol,
                        output_filename)

            # Get the transcript ID to protein ID mapping
            tx_to_protein_mapping = self._create_tx_id_to_protein_id_mapping(
                protein_id_mapping_file)

            seq_dict = {}
            for in_seq_file in ensembl_input_fastas:
                # The context manager closes the fasta even if parsing
                # raises.
                with open(in_seq_file) as in_seq_handle:
                    seq_dict.update(self._create_seq_dict(in_seq_handle))
                logger.info("Parsed fasta file: %s", in_seq_file)

            for file_ctr, in_file in enumerate(ensembl_input_gtfs):
                seq_dict_keys = seq_dict.keys()
                ctr = 0
                # parse_simple is handed the path, so no file handle is
                # opened here.
                for rec in GFF.parse_simple(in_file):

                    # transcript id seems to always be a list of length 1
                    if len(rec['quals']['transcript_id']) > 1:
                        logger.warning(
                            "ensembl records had more than one transcript id: %s",
                            rec['quals']['transcript_id'])

                    self._convertGFFRecordToTranscript(rec, seq_dict,
                                                       seq_dict_keys,
                                                       tx_to_protein_mapping)
                    ctr += 1
                    if (ctr % 10000) == 0:
                        logger.info(
                            "Added %d lines of gtf %d of %d (%s) into internal transcript index.",
                            ctr, file_ctr + 1, len(ensembl_input_gtfs),
                            in_file)
                logger.info("Finished %d lines of gtf (%s)", ctr, in_file)

            logger.info("Populating final db with internal transcript index.")
            transcript_index_keys = self._transcript_index.keys()
            for i, k in enumerate(transcript_index_keys):
                tx = self._transcript_index[k]

                # Populate the protein sequence
                tx.set_protein_seq(self._determine_protein_seq(tx))

                shove[k] = tx
                if i % 10000 == 0:
                    logger.info(
                        "Saved %0.1f%% of transcript index to disk with protein sequence.",
                        float(i * 100) / float(len(transcript_index_keys)))

            logger.info("Transcript index created %d transcripts: %s://%s",
                        len(shove.keys()), protocol, output_filename)
        finally:
            # Close the store even when building the index fails part-way.
            shove.close()