def test_build_ensembl_transcript_index(self):
    """Build the gtf portion of the ensembl transcript db and inspect one stored transcript.

    The index is built from the trimmed yeast gtf/fasta test data, opened again through shove,
    and the entry for "YDR529C" is checked for a sequence, cds and exons.  The output is removed
    at the end of the test.
    """
    # How the trimmed test inputs were generated:
    # cat ~/oncotator_pycharm/oncotator/test/testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf | cut -f 9 | cut -f 5 --delimiter=" " | sort | uniq | sed -r "s/;//g" | sed -r "s/\"//g"
    #  snR84, tK(UUU)K, YAL067C, YAL067W-A, YAL068C, YAL068W-A, YAL069W, YBR278W, YBR279W, YBR280C, YBR281C, YDR528W, YDR529C, YKR074W,
    #
    # grep -Pzo ">(snR84|tK\(UUU\)K|YAL067C|YAL067W-A|YAL068C|YAL068W-A|YAL069W|YBR278W|YBR279W|YBR280C|YBR281C|YDR528W|YDR529C|YKR074W)([A-Za-z_0-9 \:\-\n]+)" Saccharomyces_cerevisiae.EF4.71.cdna.all.fa >Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa
    #
    ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
    ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

    output_filename = "out/test_ensembl_gtf.db"
    protocol = "file"
    genome_build_factory = GenomeBuildFactory()
    genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], output_filename, protocol=protocol)
    self.assertTrue(os.path.exists(output_filename))

    shove = Shove(protocol + "://" + output_filename, "memory://")
    self.assertTrue(len(shove.keys()) > 0)
    self.assertTrue("YDR529C" in shove.keys())
    t = shove["YDR529C"]
    self.assertTrue(t.get_seq() is not None)
    # Compare by value: `is not ""` tests object identity, which says nothing about emptiness.
    self.assertTrue(t.get_seq() != "")
    self.assertTrue(len(t.get_cds()) > 0)
    self.assertTrue(len(t.get_exons()) > 0)
    MutUtils.removeDir(output_filename)
def test_gencode_small(self):
    """Check that Transcript instances can be created from a small gencode gtf and fasta."""
    gtf_path = "testdata/gencode/MAPK1.gencode.v19.annotation.gtf"
    fasta_path = "testdata/gencode/MAPK1.gencode.v19.pc_transcripts.fa"
    index_prefix = "out/test_small_gencode"

    # Drop whatever indices an earlier run may have left behind.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(index_prefix + suffix, ignore_errors=True)

    factory = GenomeBuildFactory()
    factory.construct_ensembl_indices([gtf_path], [fasta_path], index_prefix)

    # Gene --> transcripts index
    by_gene = Shove("file://" + index_prefix + ".transcript_by_gene.idx", "memory://", optimize=False)
    mapk1_transcripts = by_gene["MAPK1"]
    self.assertTrue(len(mapk1_transcripts) == 4)

    # Genomic position bin --> transcripts index
    by_gp_bin = Shove("file://" + index_prefix + ".transcript_by_gp_bin.idx", "memory://", optimize=False)
    binned_transcripts = by_gp_bin["22_753"]
    self.assertTrue(binned_transcripts[0].get_strand() == "-")
    self.assertTrue(len(binned_transcripts) == 1)

    # The sequence is only checked for the one transcript id of interest.
    for tx in binned_transcripts:
        if tx.get_transcript_id() == "ENST00000215832.6":
            self.assertTrue(tx.get_seq().startswith("AGGCAATCGGTCCGAG"))
def build_ensembl_transcript_index(self, ensembl_input_gtfs, ensembl_input_fastas, output_filename, protocol="file", protein_id_mapping_file=None):
    """Create the transcript index (using shove) for ensembl.  Key is transcript ID.

    Note:  This method will hold the entire transcript index in RAM.

    :param ensembl_input_gtfs: (list) gtf files to read
    :param ensembl_input_fastas: (list) sequence data for transcripts corresponding to what is in the gtfs
    :param output_filename: location of the shove store (appended to ``protocol + "://"``)
    :param protocol: shove protocol.  Usually "file" or "sqlite"
    :param protein_id_mapping_file: handed to ``_create_tx_id_to_protein_id_mapping`` to obtain the
        transcript ID to protein ID mapping (default None)
    """
    logger = logging.getLogger(__name__)

    # Example code taken from http://biopython.org/wiki/GFF_Parsing
    shove = Shove(protocol + "://" + output_filename, "memory://")
    try:
        logger.info("Transcript index being created: " + protocol + "://" + output_filename)

        # Get the transcript ID to protein ID mapping
        tx_to_protein_mapping = self._create_tx_id_to_protein_id_mapping(protein_id_mapping_file)

        seq_dict = {}
        for in_seq_file in ensembl_input_fastas:
            # The context manager closes the fasta even when parsing raises.
            with open(in_seq_file) as in_seq_handle:
                seq_dict.update(self._create_seq_dict(in_seq_handle))
            logger.info("Parsed fasta file: " + in_seq_file)

        num_gtfs = len(ensembl_input_gtfs)
        for file_ctr, in_file in enumerate(ensembl_input_gtfs):
            seq_dict_keys = seq_dict.keys()
            ctr = 0
            # The parser is given the path itself, so no separate file handle is opened here.
            for rec in GFF.parse_simple(in_file):

                # transcript id seems to always be a list of length 1
                if len(rec['quals']['transcript_id']) > 1:
                    logger.warning("ensembl records had more than one transcript id: " + str(rec['quals']['transcript_id']))

                self._convertGFFRecordToTranscript(rec, seq_dict, seq_dict_keys, tx_to_protein_mapping)
                ctr += 1
                if (ctr % 10000) == 0:
                    logger.info("Added " + str(ctr) + " lines of gtf " + str(file_ctr+1) + " of " + str(num_gtfs) + " (" + in_file + ") into internal transcript index.")
            logger.info("Finished " + str(ctr) + " lines of gtf (" + in_file + ")")

        logger.info("Populating final db with internal transcript index.")
        transcript_index_keys = self._transcript_index.keys()
        num_transcripts = len(transcript_index_keys)
        for i, k in enumerate(transcript_index_keys):
            transcript = self._transcript_index[k]

            # Populate the protein sequence
            protein_sequence = self._determine_protein_seq(transcript)
            transcript.set_protein_seq(protein_sequence)
            shove[k] = transcript
            if i % 10000 == 0:
                logger.info("Saved %0.1f%% of transcript index to disk with protein sequence." % (float(i*100)/float(num_transcripts)))

        logger.info("Transcript index created " + str(len(shove.keys())) + " transcripts: " + protocol + "://" + output_filename)
    finally:
        # Release the store even if parsing or writing failed part-way.
        shove.close()
def build_ensembl_transcripts_by_genomic_location_index(self, ensembl_transcript_index_fname, output_filename, protocol="file"):
    """Create an index for genomic position to transcripts index, using a transcript index created in
        build_ensembl_transcript_index

    Every transcript in the input index is stored under the key ``<contig>_<bin>``, where the bin is
    ``region2bin(start, end)`` of that transcript.  Each value is the list of transcripts sharing the key.

    :param ensembl_transcript_index_fname: location of the transcript index to read
    :param output_filename: location of the index to write
    :param protocol: shove protocol used for both the input and the output store
    """
    logger = logging.getLogger(__name__)
    transcript_db = Shove(protocol + "://" + ensembl_transcript_index_fname)
    try:
        output_db = Shove(protocol + "://" + output_filename, optimize=False)
        try:
            for i, tx_id in enumerate(transcript_db.keys()):
                tx = transcript_db[tx_id]
                genomic_location_bin = region2bin(tx.get_start(), tx.get_end())
                key = tx.get_contig() + "_" + str(genomic_location_bin)
                try:
                    tmpList = output_db[key]
                except KeyError:
                    # First transcript for this key: start a fresh list; it is written once below.
                    tmpList = []
                tmpList.append(tx)
                output_db[key] = tmpList
                if (i + 1) % 10000 == 0:
                    # i is zero-based, so i + 1 transcripts have been added at this point.
                    logger.info("Genomic position index added " + str(i + 1) + " transcripts so far.")
        finally:
            output_db.close()
    finally:
        transcript_db.close()
def build_ensembl_transcripts_by_gene_index(self, ensembl_transcript_index_fname, output_filename, protocol="file"):
    """ Create an index for gene --> transcripts using a transcript index created in build_ensembl_transcript_index

    Every transcript in the input index is stored under the value of its ``get_gene()``; each value is
    the list of transcripts sharing that gene.

    :param ensembl_transcript_index_fname: file/dir location for ensembl transcript db
    :param output_filename: location of the gene index to write
    :param protocol: shove protocol used for both the input and the output store
    :return: None
    """
    #TODO: This may need to be moved to the init of the transcript datasource as that may be faster.
    logger = logging.getLogger(__name__)
    transcript_db = Shove(protocol + "://" + ensembl_transcript_index_fname, "memory://")
    try:
        output_db = Shove(protocol + "://" + output_filename, "memory://", optimize=False)
        try:
            for i, tx_id in enumerate(transcript_db.keys()):
                tx = transcript_db[tx_id]
                gene = tx.get_gene()
                try:
                    tmpList = output_db[gene]
                except KeyError:
                    # First transcript for this gene: start a fresh list; it is written once below.
                    tmpList = []
                tmpList.append(tx)
                output_db[gene] = tmpList
                if (i + 1) % 10000 == 0:
                    # i is zero-based, so i + 1 transcripts have been added at this point.
                    logger.info("Gene index added " + str(i + 1) + " transcripts so far.")
            logger.info("Finished gene index with " + str(len(output_db.keys())) + " genes.")
        finally:
            output_db.close()
    finally:
        transcript_db.close()
def build_ensembl_transcripts_by_genomic_location_index(self, ensembl_transcript_index_fname, output_filename, protocol="file"):
    """Create an index for genomic position to transcripts index, using a transcript index created in
        build_ensembl_transcript_index

    Every transcript in the input index is stored under the key ``<contig>_<bin>``, where the bin is
    ``region2bin(start, end)`` of that transcript.  Each value is the list of transcripts sharing the key.

    :param ensembl_transcript_index_fname: location of the transcript index to read
    :param output_filename: location of the index to write
    :param protocol: shove protocol used for both the input and the output store
    """
    logger = logging.getLogger(__name__)
    transcript_db = Shove(protocol + "://" + ensembl_transcript_index_fname)
    try:
        output_db = Shove(protocol + "://" + output_filename, optimize=False)
        try:
            for i, tx_id in enumerate(transcript_db.keys()):
                tx = transcript_db[tx_id]
                genomic_location_bin = region2bin(tx.get_start(), tx.get_end())
                key = tx.get_contig() + "_" + str(genomic_location_bin)
                try:
                    tmpList = output_db[key]
                except KeyError:
                    # First transcript for this key: start a fresh list; it is written once below.
                    tmpList = []
                tmpList.append(tx)
                output_db[key] = tmpList
                if (i + 1) % 10000 == 0:
                    # i is zero-based, so i + 1 transcripts have been added at this point.
                    logger.info("Genomic position index added " + str(i + 1) + " transcripts so far.")
        finally:
            output_db.close()
    finally:
        transcript_db.close()
def parse_with_shove(fname, callableParsingFunction, pickleDir=""):
    """Parse ``fname`` with ``callableParsingFunction``, caching the result in a shove file store.

    The store lives at ``<pickleDir>/<basename of fname>.shv``.  If that path already exists it is
    opened and returned without parsing.  Otherwise ``fname`` is parsed, every entry of the parsed
    structure is copied into a new store (after its ``references`` attribute is deleted), and that
    store is returned.

    :param fname: file to parse
    :param callableParsingFunction: called with an open read handle on ``fname``; the result must
        provide ``keys()`` and item access
    :param pickleDir: directory for the store; a trailing "/" is accepted but no longer required
    :return: the shove store
    """
    logger = logging.getLogger(__name__)
    # os.path.join supplies the separator, so "dir" and "dir/" both resolve to "dir/<name>.shv".
    shoveFilename = os.path.join(pickleDir, os.path.basename(fname) + ".shv")

    if os.path.exists(shoveFilename):
        logger.info("Loading shove structure: " + str(shoveFilename))
        return Shove("file://" + shoveFilename, "simple://")

    logger.info("Parsing...")
    # The handle stays open until the copy is finished (in case the parsed structure still reads
    # from it) and is then closed by the context manager.
    with open(fname, 'r') as in_handle:
        tmpStruct = callableParsingFunction(in_handle)
        logger.info("Writing shove db: " + str(shoveFilename))
        g = Shove("file://" + shoveFilename)
        for k in tmpStruct.keys():
            # NOTE(review): the original author flagged that dropping `references` here
            # "may be causing an error later down the road".
            del tmpStruct[k].references
            g[k] = tmpStruct[k]
    return g
def test_retrieving_sequence(self):
    """Ensure we can retrieve a sequence from an ensembl transcript given a gene. """
    gtf_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
    fasta_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"
    index_prefix = "out/test_retrieving_full_indices_ensembl"

    # Drop whatever indices an earlier run may have left behind.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(index_prefix + suffix, ignore_errors=True)

    factory = GenomeBuildFactory()
    factory.construct_ensembl_indices([gtf_path], [fasta_path], index_prefix)

    def pick(candidates, wanted_id):
        # Linear scan: the first transcript whose id matches, or the last candidate if none does.
        chosen = candidates[0]
        for chosen in candidates:
            if chosen._transcript_id == wanted_id:
                break
        return chosen

    by_gene = Shove("file://" + index_prefix + ".transcript_by_gene.idx", optimize=False)

    seo1 = pick(by_gene['SEO1'], "YAL067C")
    self.assertTrue(seo1.get_seq().startswith('ATGTATTCAATTGTTAAAGAGATTATTGTAGATCCTTACAAAAGACTAAAATGGGGTTTT'))

    pau8 = pick(by_gene['PAU8'], "YAL068C")
    self.assertTrue(pau8.get_strand() == "-")

    by_gp_bin = Shove("file://" + index_prefix + ".transcript_by_gp_bin.idx", "memory://")
    binned = by_gp_bin["I_585"]
    self.assertTrue(len(binned) == 5, "There should be 5 transcripts.")

    yal069w = pick(binned, "YAL069W")
    self.assertTrue(yal069w.get_strand() == "+")
def test_build_ensembl_transcripts_by_genomic_location_index(self):
    """Test that we can get an ensembl transcript from a genomic position"""
    protocol = "file"
    transcript_index_filename = "out/test_ensemble_gtf_for_gp.db"
    output_filename = "out/test_ensemble_gtf_for_gp.db.idx"
    shutil.rmtree(output_filename, ignore_errors=True)

    ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
    ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

    genome_build_factory = GenomeBuildFactory()
    genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], transcript_index_filename, protocol=protocol)
    genome_build_factory.build_ensembl_transcripts_by_genomic_location_index(transcript_index_filename, output_filename, protocol=protocol)

    # Now load the index and look something up.
    gp_index = Shove(protocol + "://" + output_filename)
    gt_transcript_id = "YAL067C"
    index_keys = gp_index.keys()  # fetched once instead of once per bin
    for location_bin in region2bins(1496172, 1496400):
        key = 'I_' + str(location_bin)
        if key in index_keys:
            # The index maps a key to a list of transcripts, so the expected id is looked for
            # among the ids of those transcripts (a list never compares equal to a string).
            found_ids = [tx.get_transcript_id() for tx in gp_index[key]]
            self.assertTrue(gt_transcript_id in found_ids)
def build_ensembl_transcripts_by_gene_index(self, ensembl_transcript_index_fname, output_filename, protocol="file"):
    """ Create an index for gene --> transcripts using a transcript index created in build_ensembl_transcript_index

    Every transcript in the input index is stored under the value of its ``get_gene()``; each value is
    the list of transcripts sharing that gene.

    :param ensembl_transcript_index_fname: file/dir location for ensembl transcript db
    :param output_filename: location of the gene index to write
    :param protocol: shove protocol used for both the input and the output store
    :return: None
    """
    #TODO: This may need to be moved to the init of the transcript datasource as that may be faster.
    logger = logging.getLogger(__name__)
    transcript_db = Shove(protocol + "://" + ensembl_transcript_index_fname, "memory://")
    try:
        output_db = Shove(protocol + "://" + output_filename, "memory://", optimize=False)
        try:
            for i, tx_id in enumerate(transcript_db.keys()):
                tx = transcript_db[tx_id]
                gene = tx.get_gene()
                try:
                    tmpList = output_db[gene]
                except KeyError:
                    # First transcript for this gene: start a fresh list; it is written once below.
                    tmpList = []
                tmpList.append(tx)
                output_db[gene] = tmpList
                if (i + 1) % 10000 == 0:
                    # i is zero-based, so i + 1 transcripts have been added at this point.
                    logger.info("Gene index added " + str(i + 1) + " transcripts so far.")
            logger.info("Finished gene index with " + str(len(output_db.keys())) + " genes.")
        finally:
            output_db.close()
    finally:
        transcript_db.close()
def test_multiple_gtf_initialization(self):
    """Check that indices can be built from several gtf & fasta files at once."""
    gtf_paths = ["testdata/gencode/CP.gencode.v19.annotation.gtf", "testdata/gencode/MAPK1.gencode.v19.annotation.gtf"]
    fasta_paths = ["testdata/gencode/CP.gencode.v19.pc_transcripts.fa", "testdata/gencode/MAPK1.gencode.v19.pc_transcripts.fa"]
    index_prefix = "out/test_multi_gencode"

    # Drop whatever indices an earlier run may have left behind.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(index_prefix + suffix, ignore_errors=True)

    factory = GenomeBuildFactory()
    factory.construct_ensembl_indices(gtf_paths, fasta_paths, index_prefix)

    by_gene = Shove("file://" + index_prefix + ".transcript_by_gene.idx", "memory://", optimize=False)

    cp_transcripts = by_gene["CP"]
    self.assertTrue(len(cp_transcripts) == 15)

    mapk1_transcripts = by_gene["MAPK1"]
    self.assertTrue(len(mapk1_transcripts) == 4)

    # Every MAPK1 transcript except ENST00000491588.1 must carry more than 100 bases of sequence.
    for tx in mapk1_transcripts:
        is_exempt = tx.get_transcript_id() == "ENST00000491588.1"
        self.assertTrue(is_exempt or len(tx.get_seq()) > 100, "No seq data for " + tx.get_transcript_id())
def test_build_ensembl_transcripts_by_gene_index(self):
    """Check building an index that yields the transcripts for a given gene."""
    protocol = "file"
    tx_index_path = "out/test_ensembl_gtf_for_gene.db"
    gene_index_path = "out/test_ensembl_gtf_for_gene.db.gene.idx"
    shutil.rmtree(gene_index_path, ignore_errors=True)

    gtf_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
    fasta_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

    # Build the transcript index first, then derive the gene index from it.
    factory = GenomeBuildFactory()
    factory.build_ensembl_transcript_index([gtf_path], [fasta_path], tx_index_path, protocol=protocol)
    factory.build_ensembl_transcripts_by_gene_index(tx_index_path, gene_index_path)

    # Reopen the gene index and look up a known gene.
    gene_index = Shove(protocol + "://" + gene_index_path, optimize=False)
    seo1_count = len(gene_index['SEO1'])
    self.assertTrue(seo1_count == 1)

    first_tx = gene_index['SEO1'][0]
    self.assertTrue(first_tx.get_transcript_id() == "YAL067C")
def test_gencode_cp(self):
    """Index a gene that used to cause problems and confirm the troublesome transcript is present."""
    gtf_path = "testdata/gencode/CP.gencode.v19.annotation.gtf"
    fasta_path = "testdata/gencode/CP.gencode.v19.pc_transcripts.fa"
    index_prefix = "out/test_cp_gencode"

    # Drop whatever indices an earlier run may have left behind.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(index_prefix + suffix, ignore_errors=True)

    factory = GenomeBuildFactory()
    factory.construct_ensembl_indices([gtf_path], [fasta_path], index_prefix)

    by_gene = Shove("file://" + index_prefix + ".transcript_by_gene.idx", "memory://", optimize=False)
    cp_transcripts = by_gene["CP"]
    self.assertTrue(len(cp_transcripts) == 15)

    # any() stops at the first match, just like a flag-and-break scan.
    troubled_transcript = "ENST00000474204.1"
    was_seen = any(tx.get_transcript_id() == troubled_transcript for tx in cp_transcripts)
    self.assertTrue(was_seen)
def build_ensembl_transcript_index(self, ensembl_input_gtfs, ensembl_input_fastas, output_filename, protocol="file", protein_id_mapping_file=None):
    """Create the transcript index (using shove) for ensembl.  Key is transcript ID.

    Note:  This method will hold the entire transcript index in RAM.

    :param ensembl_input_gtfs: (list) gtf files to read
    :param ensembl_input_fastas: (list) sequence data for transcripts corresponding to what is in the gtfs
    :param output_filename: location of the shove store (appended to ``protocol + "://"``)
    :param protocol: shove protocol.  Usually "file" or "sqlite"
    :param protein_id_mapping_file: handed to ``_create_tx_id_to_protein_id_mapping`` to obtain the
        transcript ID to protein ID mapping (default None)
    """
    logger = logging.getLogger(__name__)

    # Example code taken from http://biopython.org/wiki/GFF_Parsing
    shove = Shove(protocol + "://" + output_filename, "memory://")
    try:
        logger.info("Transcript index being created: " + protocol + "://" + output_filename)

        # Get the transcript ID to protein ID mapping
        tx_to_protein_mapping = self._create_tx_id_to_protein_id_mapping(
            protein_id_mapping_file)

        seq_dict = {}
        for in_seq_file in ensembl_input_fastas:
            # The context manager closes the fasta even when parsing raises.
            with open(in_seq_file) as in_seq_handle:
                seq_dict.update(self._create_seq_dict(in_seq_handle))
            logger.info("Parsed fasta file: " + in_seq_file)

        num_gtfs = len(ensembl_input_gtfs)
        for file_ctr, in_file in enumerate(ensembl_input_gtfs):
            seq_dict_keys = seq_dict.keys()
            ctr = 0
            # The parser is given the path itself, so no separate file handle is opened here.
            for rec in GFF.parse_simple(in_file):

                # transcript id seems to always be a list of length 1
                if len(rec['quals']['transcript_id']) > 1:
                    logger.warning(
                        "ensembl records had more than one transcript id: " +
                        str(rec['quals']['transcript_id']))

                self._convertGFFRecordToTranscript(rec, seq_dict,
                                                   seq_dict_keys,
                                                   tx_to_protein_mapping)
                ctr += 1
                if (ctr % 10000) == 0:
                    logger.info(
                        "Added " + str(ctr) + " lines of gtf " +
                        str(file_ctr + 1) + " of " + str(num_gtfs) +
                        " (" + in_file + ") into internal transcript index.")
            logger.info("Finished " + str(ctr) + " lines of gtf (" + in_file + ")")

        logger.info("Populating final db with internal transcript index.")
        transcript_index_keys = self._transcript_index.keys()
        num_transcripts = len(transcript_index_keys)
        for i, k in enumerate(transcript_index_keys):
            transcript = self._transcript_index[k]

            # Populate the protein sequence
            protein_sequence = self._determine_protein_seq(transcript)
            transcript.set_protein_seq(protein_sequence)
            shove[k] = transcript
            if i % 10000 == 0:
                logger.info(
                    "Saved %0.1f%% of transcript index to disk with protein sequence."
                    % (float(i * 100) / float(num_transcripts)))

        logger.info("Transcript index created " + str(len(shove.keys())) +
                    " transcripts: " + protocol + "://" + output_filename)
    finally:
        # Release the store even if parsing or writing failed part-way.
        shove.close()