def build_ensembl_transcripts_by_genomic_location_index(self, ensembl_transcript_index_fname, output_filename, protocol="file"):
    """Create an index for genomic position to transcripts index, using a
    transcript index created in build_ensembl_transcript_index.

    The output db is keyed by "<contig>_<bin>", where the bin is computed
    from the transcript start/end via region2bin, and each value is the
    list of transcripts falling in that bin.

    :param ensembl_transcript_index_fname: file/dir location for the ensembl transcript db
    :param output_filename: destination for the genomic-location index db
    :param protocol: shove protocol.  Usually "file" or "sqlite"
    """
    logger = logging.getLogger(__name__)
    transcript_db = Shove(protocol + "://" + ensembl_transcript_index_fname)
    output_db = Shove(protocol + "://" + output_filename, optimize=False)
    transcript_keys = transcript_db.keys()
    for i, tx_id in enumerate(transcript_keys):
        tx = transcript_db[tx_id]
        genomic_location_bin = region2bin(tx.get_start(), tx.get_end())
        key = tx.get_contig() + "_" + str(genomic_location_bin)
        # Shove does not persist in-place mutation of a stored value, so we
        # read-modify-write the whole bucket.  (The original also wrote an
        # empty list to the db before immediately overwriting it; that
        # redundant write is removed here.)
        try:
            tmpList = output_db[key]
        except KeyError:
            tmpList = []
        tmpList.append(tx)
        output_db[key] = tmpList
        if (i + 1) % 10000 == 0:
            # i+1 transcripts have been processed at this point; the
            # original logged str(i), which was off by one.
            logger.info("Genomic position index added " + str(i + 1) + " transcripts so far.")
    output_db.close()
    transcript_db.close()
def build_ensembl_transcripts_by_gene_index(self, ensembl_transcript_index_fname, output_filename, protocol="file"):
    """
    Create an index for gene --> transcripts using a transcript index created in build_ensembl_transcript_index

    :param ensembl_transcript_index_fname: file/dir location for ensembl transcript db
    :param output_filename: destination for the gene index db
    :param protocol: shove protocol.  Usually "file" or "sqlite"
    :return:
    """
    #TODO: This may need to be moved to the init of the transcript datasource as that may be faster.
    logger = logging.getLogger(__name__)
    transcript_db = Shove(protocol + "://" + ensembl_transcript_index_fname, "memory://")
    output_db = Shove(protocol + "://" + output_filename, "memory://", optimize=False)
    transcript_keys = transcript_db.keys()
    for i, tx_id in enumerate(transcript_keys):
        tx = transcript_db[tx_id]
        gene = tx.get_gene()
        # Read-modify-write the gene's transcript list; Shove does not
        # persist in-place mutation of stored values.  (The original wrote
        # an empty list to the db before immediately overwriting it; that
        # redundant write is removed here.)
        try:
            tmpList = output_db[gene]
        except KeyError:
            tmpList = []
        tmpList.append(tx)
        output_db[gene] = tmpList
        if (i + 1) % 10000 == 0:
            # i+1 transcripts have been processed at this point; the
            # original logged str(i), which was off by one.
            logger.info("Gene index added " + str(i + 1) + " transcripts so far.")
    logger.info("Finished gene index with " + str(len(output_db.keys())) + " genes.")
    output_db.close()
    transcript_db.close()
def build_ensembl_transcripts_by_genomic_location_index(self, ensembl_transcript_index_fname, output_filename, protocol="file"):
    """Create an index for genomic position to transcripts index, using a
    transcript index created in build_ensembl_transcript_index.

    Output db keys look like "<contig>_<bin>"; each value is the list of
    transcripts whose start/end map to that bin via region2bin.
    """
    tx_db = Shove(protocol + "://" + ensembl_transcript_index_fname)
    location_db = Shove(protocol + "://" + output_filename, optimize=False)
    for counter, stored_key in enumerate(tx_db.keys(), start=1):
        transcript = tx_db[stored_key]
        bin_id = region2bin(transcript.get_start(), transcript.get_end())
        bin_key = transcript.get_contig() + "_" + str(bin_id)
        # Stored values must be rewritten wholesale for the db to pick up
        # the appended transcript.
        try:
            bucket = location_db[bin_key]
        except KeyError:
            location_db[bin_key] = []
            bucket = location_db[bin_key]
        bucket.append(transcript)
        location_db[bin_key] = bucket
        if counter % 10000 == 0:
            logging.getLogger(__name__).info("Genomic position index added " + str(counter - 1) + " transcripts so far.")
    location_db.close()
    tx_db.close()
def build_ensembl_transcript_index(self, ensembl_input_gtfs, ensembl_input_fastas, output_filename, protocol="file", protein_id_mapping_file=None):
    """Create the transcript index (using shove) for ensembl.  Key is transcript ID.

    Note: This method will hold the entire transcript index in RAM.

    :param ensembl_input_gtfs: (list) gtf files to parse
    :param ensembl_input_fastas: (list) sequence data for transcripts corresponding to what is in the gtfs
    :param output_filename: destination for the shove transcript db
    :param protocol: shove protocol.  Usually "file" or "sqlite"
    :param protein_id_mapping_file: optional file mapping transcript ID --> protein ID
    """
    # Example code taken from http://biopython.org/wiki/GFF_Parsing
    logger = logging.getLogger(__name__)
    shove = Shove(protocol + "://" + output_filename, "memory://")
    logger.info("Transcript index being created: " + protocol + "://" + output_filename)

    # Get the transcript ID to protein ID mapping
    tx_to_protein_mapping = self._create_tx_id_to_protein_id_mapping(protein_id_mapping_file)

    seq_dict = {}
    for in_seq_file in ensembl_input_fastas:
        # Context manager guarantees the handle is closed even if parsing raises.
        with open(in_seq_file) as in_seq_handle:
            seq_dict.update(self._create_seq_dict(in_seq_handle))
        logger.info("Parsed fasta file: " + in_seq_file)

    for file_ctr, in_file in enumerate(ensembl_input_gtfs):
        seq_dict_keys = seq_dict.keys()
        ctr = 0
        # GFF.parse_simple takes the path directly; the original also opened
        # an in_handle here that was never used, which has been removed.
        for rec in GFF.parse_simple(in_file):
            # transcript id seems to always be a list of length 1
            if len(rec['quals']['transcript_id']) > 1:
                # Logger.warn is a deprecated alias; warning() is the
                # supported spelling.
                logger.warning("ensembl records had more than one transcript id: " + str(rec['quals']['transcript_id']))

            self._convertGFFRecordToTranscript(rec, seq_dict, seq_dict_keys, tx_to_protein_mapping)
            ctr += 1
            if (ctr % 10000) == 0:
                logger.info("Added " + str(ctr) + " lines of gtf " + str(file_ctr + 1) + " of " + str(len(ensembl_input_gtfs)) + " (" + in_file + ") into internal transcript index.")
        logger.info("Finished " + str(ctr) + " lines of gtf (" + in_file + ")")

    logger.info("Populating final db with internal transcript index.")
    transcript_index_keys = self._transcript_index.keys()
    for i, k in enumerate(transcript_index_keys):
        # Populate the protein sequence
        protein_sequence = self._determine_protein_seq(self._transcript_index[k])
        self._transcript_index[k].set_protein_seq(protein_sequence)

        shove[k] = self._transcript_index[k]
        if (i + 1) % 10000 == 0:
            # (i + 1) avoids the meaningless "Saved 0.0%" message that the
            # original emitted on the very first transcript.
            logger.info("Saved %0.1f%% of transcript index to disk with protein sequence." % (float((i + 1) * 100) / float(len(transcript_index_keys))))

    logger.info("Transcript index created " + str(len(shove.keys())) + " transcripts: " + protocol + "://" + output_filename)
    shove.close()
def build_ensembl_transcripts_by_gene_index(self, ensembl_transcript_index_fname, output_filename, protocol="file"):
    """
    Create an index for gene --> transcripts using a transcript index created in build_ensembl_transcript_index

    :param ensembl_transcript_index_fname: file/dir location for ensembl transcript db
    :return:
    """
    #TODO: This may need to be moved to the init of the transcript datasource as that may be faster.
    source_db = Shove(protocol + "://" + ensembl_transcript_index_fname, "memory://")
    gene_db = Shove(protocol + "://" + output_filename, "memory://", optimize=False)
    for count, transcript_id in enumerate(source_db.keys(), start=1):
        transcript = source_db[transcript_id]
        gene_name = transcript.get_gene()
        # The stored list must be re-assigned for the db to see the append.
        try:
            bucket = gene_db[gene_name]
        except KeyError:
            gene_db[gene_name] = []
            bucket = gene_db[gene_name]
        bucket.append(transcript)
        gene_db[gene_name] = bucket
        if count % 10000 == 0:
            logging.getLogger(__name__).info("Gene index added " + str(count - 1) + " transcripts so far.")
    logging.getLogger(__name__).info("Finished gene index with " + str(len(gene_db.keys())) + " genes.")
    gene_db.close()
    source_db.close()
def build_ensembl_transcript_index(self, ensembl_input_gtfs, ensembl_input_fastas, output_filename, protocol="file", protein_id_mapping_file=None):
    """Create the transcript index (using shove) for ensembl.  Key is transcript ID.

    Note: This method will hold the entire transcript index in RAM.

    :param ensembl_input_gtfs: (list)
    :param ensembl_input_fastas: (list) sequence data for transcripts corresponding to what is in the gtfs
    :param output_filename:
    :param protocol: shove protocol.  Usually "file" or "sqlite"
    """
    # Example code taken from http://biopython.org/wiki/GFF_Parsing
    log = logging.getLogger(__name__)
    shove = Shove(protocol + "://" + output_filename, "memory://")
    log.info("Transcript index being created: " + protocol + "://" + output_filename)

    # Get the transcript ID to protein ID mapping
    tx_to_protein_mapping = self._create_tx_id_to_protein_id_mapping(protein_id_mapping_file)

    # Fold every fasta's sequence records into a single lookup table.
    seq_dict = {}
    for fasta_path in ensembl_input_fastas:
        fasta_handle = open(fasta_path)
        seq_dict.update(self._create_seq_dict(fasta_handle))
        fasta_handle.close()
        log.info("Parsed fasta file: " + fasta_path)

    for gtf_index, gtf_path in enumerate(ensembl_input_gtfs):
        gtf_handle = open(gtf_path)
        seq_dict_keys = seq_dict.keys()
        line_total = 0
        for rec in GFF.parse_simple(gtf_path):  #(gtf_handle, base_dict=seq_dict):
            # transcript id seems to always be a list of length 1
            tx_ids = rec['quals']['transcript_id']
            if len(tx_ids) > 1:
                log.warn("ensembl records had more than one transcript id: " + str(tx_ids))

            self._convertGFFRecordToTranscript(rec, seq_dict, seq_dict_keys, tx_to_protein_mapping)
            line_total += 1
            if line_total % 10000 == 0:
                log.info("Added " + str(line_total) + " lines of gtf " + str(gtf_index + 1) + " of " + str(len(ensembl_input_gtfs)) + " (" + gtf_path + ") into internal transcript index.")
        gtf_handle.close()
        log.info("Finished " + str(line_total) + " lines of gtf (" + gtf_path + ")")

    log.info("Populating final db with internal transcript index.")
    all_tx_keys = self._transcript_index.keys()
    key_total = len(all_tx_keys)
    for position, tx_key in enumerate(all_tx_keys):
        # Populate the protein sequence
        tx_record = self._transcript_index[tx_key]
        tx_record.set_protein_seq(self._determine_protein_seq(tx_record))

        shove[tx_key] = tx_record
        if position % 10000 == 0:
            log.info("Saved %0.1f%% of transcript index to disk with protein sequence." % (float(position * 100) / float(key_total)))

    log.info("Transcript index created " + str(len(shove.keys())) + " transcripts: " + protocol + "://" + output_filename)
    shove.close()