def t_key_whitespace(self):
    """Fix keys with problematic whitespace.
    """
    tfile = os.path.join(self._test_dir, "spaces.gff3")
    for i, line_info in enumerate(GFF.parse_simple(tfile)):
        if i > 2:
            assert line_info["quals"]["foo"] == ["bar"]
def t_simple_parsing_nesting(self):
    """Simple parsing for lines with nesting, using the simplified API.
    """
    test_gff = os.path.join(self._test_dir, "transcripts.gff3")
    num_lines = 0
    for line_info in GFF.parse_simple(test_gff):
        num_lines += 1
    assert num_lines == 16, num_lines
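# A minimal standalone sketch (not part of the tests above) showing the shape of
# the dictionaries yielded by GFF.parse_simple from the bcbio-gff package: each
# record exposes "rec_id", "location" and "quals", as used in the examples in
# this section. The file name is a placeholder.
from BCBio import GFF

def print_simple_records(gff_file="transcripts.gff3"):
    for line_info in GFF.parse_simple(gff_file):
        start, end = line_info["location"]
        print(line_info["rec_id"], start, end, line_info["quals"])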
def build_ensembl_transcript_index(self, ensembl_input_gtfs, ensembl_input_fastas, output_filename, protocol="file", protein_id_mapping_file=None):
    """Create the transcript index (using shove) for ensembl.  Key is transcript ID.

    Note: This method will hold the entire transcript index in RAM.

    :param ensembl_input_gtfs: (list)
    :param ensembl_input_fastas: (list) sequence data for transcripts corresponding to what is in the gtfs
    :param output_filename:
    :param protocol: shove protocol.  Usually "file" or "sqlite"
    """
    # Example code taken from http://biopython.org/wiki/GFF_Parsing
    shove = Shove(protocol + "://" + output_filename, "memory://")
    logging.getLogger(__name__).info("Transcript index being created: " + protocol + "://" + output_filename)

    # Get the transcript ID to protein ID mapping
    tx_to_protein_mapping = self._create_tx_id_to_protein_id_mapping(protein_id_mapping_file)

    seq_dict = {}
    for in_seq_file in ensembl_input_fastas:
        in_seq_handle = open(in_seq_file)
        seq_dict.update(self._create_seq_dict(in_seq_handle))
        in_seq_handle.close()
        logging.getLogger(__name__).info("Parsed fasta file: " + in_seq_file)

    for file_ctr, in_file in enumerate(ensembl_input_gtfs):
        in_handle = open(in_file)
        seq_dict_keys = seq_dict.keys()
        ctr = 0
        for rec in GFF.parse_simple(in_file):  # (in_handle, base_dict=seq_dict):
            # transcript id seems to always be a list of length 1
            if len(rec['quals']['transcript_id']) > 1:
                logging.getLogger(__name__).warn("ensembl records had more than one transcript id: " + str(rec['quals']['transcript_id']))

            self._convertGFFRecordToTranscript(rec, seq_dict, seq_dict_keys, tx_to_protein_mapping)
            ctr += 1
            if (ctr % 10000) == 0:
                logging.getLogger(__name__).info("Added " + str(ctr) + " lines of gtf " + str(file_ctr + 1) + " of " + str(len(ensembl_input_gtfs)) + " (" + in_file + ") into internal transcript index.")
        in_handle.close()
        logging.getLogger(__name__).info("Finished " + str(ctr) + " lines of gtf (" + in_file + ")")

    logging.getLogger(__name__).info("Populating final db with internal transcript index.")

    transcript_index_keys = self._transcript_index.keys()
    for i, k in enumerate(transcript_index_keys):
        # Populate the protein sequence
        protein_sequence = self._determine_protein_seq(self._transcript_index[k])
        self._transcript_index[k].set_protein_seq(protein_sequence)

        shove[k] = self._transcript_index[k]
        if i % 10000 == 0:
            logging.getLogger(__name__).info("Saved %0.1f%% of transcript index to disk with protein sequence." % (float(i * 100) / float(len(transcript_index_keys))))

    logging.getLogger(__name__).info("Transcript index created " + str(len(shove.keys())) + " transcripts: " + protocol + "://" + output_filename)
    shove.close()
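# A hedged sketch (not part of the method above) of how the resulting index
# might be read back: build_ensembl_transcript_index stores one transcript
# object per transcript ID in a Shove store, so reopening the same URL should
# give dict-style access. Paths and IDs below are placeholders.
from shove import Shove

def load_transcript(output_filename, tx_id, protocol="file"):
    index = Shove(protocol + "://" + output_filename, "memory://")
    try:
        return index[tx_id]
    finally:
        index.close()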
def parse_tx_in_region(tx_file, region):
    """Yield transcripts whose start or end falls inside the given region,
    reporting chromosome, coordinates, gene_id, transcript_id and FPKM.
    """
    want = ["gene_id", "transcript_id", "FPKM"]
    se_range = set(range(region["start"], region["end"]))
    limit_info = {"gff_id": [region["space"]], "gff_type": ["transcript"]}
    for rec in GFF.parse_simple(tx_file, limit_info=limit_info):
        s, e = rec["location"]
        if s in se_range or e in se_range:
            out = {"chr": rec["rec_id"], "start": s, "end": e}
            for n in want:
                out[n] = rec["quals"][n][0]
            yield out
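# Example call for parse_tx_in_region above (a sketch; the file name and
# coordinates are placeholders). The region dict needs the "space", "start"
# and "end" keys the function reads, and each yielded dict carries the keys
# built in its "out" dictionary.
def example_tx_query():
    region = {"space": "chr1", "start": 1000000, "end": 2000000}
    for tx in parse_tx_in_region("cufflinks_transcripts.gtf", region):
        print(tx["transcript_id"], tx["FPKM"])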