def remove_UTR_ensembl_exons (protein_id, species, exons):
    '''
    Build a new list of EnsemblExon copies of the given exons.
    This is for the purpose of statistics generating.

    NOTE(review): despite the name, nothing is trimmed here - every exon
    is copied with its original start, stop, strand and sequence.
    Presumably the actual UTR removal is still to be implemented; confirm
    with the callers before relying on the result being UTR-free.

    @param protein_id: reference protein id, first half of the data map key
    @param species:    species name, second half of the data map key
    @param exons:      iterable of exon objects exposing ref_protein_id,
                       species, exon_id, start, stop, strand, sequence
                       and ordinal
    @return: list with one new EnsemblExon per input exon (ordinal
             preserved), or None when the data map lookup for
             (protein_id, species) raises KeyError
    '''
    data_map_container = DataMapContainer.Instance()

    # The data map itself is not used; the lookup only decides whether
    # this (protein, species) pair is known at all.
    try:
        data_map_container.get((protein_id, species))
    except KeyError:
        return None

    new_exons = []
    for exon in exons:
        new_exon = EnsemblExon((exon.ref_protein_id, exon.species),
                               exon.exon_id,
                               exon.start, exon.stop,
                               exon.strand, exon.sequence)
        new_exon.set_exon_ordinal(exon.ordinal)
        # Previously the copy was dropped here, so the function always
        # returned an empty list.
        new_exons.append(new_exon)

    return new_exons
def load_exons (self):
    '''
    Load the exons from the fasta file and create the dictionary
    mapping them by their Ensembl id (stored in self.exons).
    Exons are given appropriate ordinals.

    Every record id is split on '|' into
    start|stop|transcript_id|exon_id|strand. For strand 1 the exon is
    created with (start, stop); for any other strand the two coordinates
    are swapped. self.strand is first taken from the data map and then
    overwritten by the strand of each record read.

    Ordinals start at 1 and follow ascending exon.start when self.strand
    is 1, descending exon.start otherwise.

    @return: list of the EnsemblExon objects in the order the records
             were read, or None if the exon fasta file cannot be opened
             (an error is then written to the 'containers' logger)
    '''
    data_map_container = DataMapContainer.Instance()
    containers_logger = Logger.Instance().get_logger('containers')

    data_map = data_map_container.get((self.ref_protein_id, self.species))
    self.strand = data_map.strand
    fasta_path = self.get_exon_file_path()

    # Only a readability probe: the records themselves are read from the
    # path by read_seq_records_from_file below.
    try:
        with open(fasta_path, 'r'):
            pass
    except IOError:
        containers_logger.error("%s,%s,%s" % (self.ref_protein_id, self.species, "Loading ensembl exons failed."))
        return None

    exon_key = (self.ref_protein_id, self.species)
    exon_list = []

    # NOTE(review): start, stop and strand are handed to EnsemblExon as
    # the strings parsed from the record id. The ordering below is only
    # numeric if EnsemblExon converts them - confirm.
    for seq_record in read_seq_records_from_file(fasta_path, IUPAC.ambiguous_dna):
        (start, stop, transcript_id, exon_id, strand) = seq_record.id.split('|')
        if int(strand) == 1:
            self.strand = 1
            exon = EnsemblExon(exon_key, exon_id, start, stop, strand, seq_record.seq)
        else:
            self.strand = -1
            exon = EnsemblExon(exon_key, exon_id, stop, start, strand, seq_record.seq)
        exon_list.append(exon)

    # Records sharing an exon id collapse to the last one read.
    self.exons = dict((exon.exon_id, exon) for exon in exon_list)

    # assign ordinals to exons
    descending = self.strand != 1
    ordered_exons = sorted(self.exons.values(),
                           key = lambda exon: exon.start,
                           reverse = descending)
    for ordinal, exon in enumerate(ordered_exons, 1):
        exon.set_exon_ordinal(ordinal)

    return exon_list